/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera (at) gmail.com)
 *          André Tupinambá (andrelrt (at) gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"

static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

static __m128i mask_565_rb;
static __m128i mask_565_pack_multiplier;
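
/* These masks are initialized by the SSE2 implementation setup code,
 * which is not part of this excerpt.  As the names suggest, mask_0080,
 * mask_00ff and mask_0101 presumably hold the 16-bit word 0x0080,
 * 0x00ff and 0x0101 replicated across the register; together they
 * drive the rounded divide-by-255 used in pix_multiply below.
 */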

static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t  = _mm_and_si128 (g, mask_565_fix_g);
    t  = _mm_srli_epi32 (t, 6);
    g  = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}
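
/* For reference, a scalar sketch of the same r5g6b5 -> 888 expansion
 * (a hypothetical helper, not used by this file): after shifting each
 * field into place, the top bits of each channel are OR'ed into the
 * newly opened low bits, so that e.g. 0x1f widens to 0xff rather than
 * 0xf8.
 */
static inline uint32_t
scalar_expand_565 (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5)  & 0x3f;
    uint32_t b = (p >> 0)  & 0x1f;

    r = (r << 3) | (r >> 2);    /* replicate top 3 bits of red   */
    g = (g << 2) | (g >> 4);    /* replicate top 2 bits of green */
    b = (b << 3) | (b >> 2);    /* replicate top 3 bits of blue  */

    return (r << 16) | (g << 8) | b;
}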

static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack_565_2packedx128_128 (__m128i lo, __m128i hi)
{
    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);

    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);

    __m128i g0 = _mm_and_si128 (lo, mask_green);
    __m128i g1 = _mm_and_si128 (hi, mask_green);

    t0 = _mm_or_si128 (t0, g0);
    t1 = _mm_or_si128 (t1, g1);

    /* Simulates _mm_packus_epi32 */
    t0 = _mm_slli_epi32 (t0, 16 - 5);
    t1 = _mm_slli_epi32 (t1, 16 - 5);
    t0 = _mm_srai_epi32 (t0, 16);
    t1 = _mm_srai_epi32 (t1, 16);
    return _mm_packs_epi32 (t0, t1);
}
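
/* On SSE4.1 the four shifts and the signed pack above would collapse
 * into a single unsigned 32 -> 16 pack (_mm_packus_epi32).  SSE2 only
 * has a signed pack, so the shift left by 11 followed by an arithmetic
 * shift right by 16 performs the required >> 5 while sign-extending
 * bit 15 of each result, which lets _mm_packs_epi32 pack the values
 * without clamping them.
 */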

static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
                _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}
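
/* In the three tests above, _mm_movemask_epi8 gathers the top bit of
 * each of the 16 bytes in the comparison result; bits 3, 7, 11 and 15
 * correspond to the alpha bytes of the four a8r8g8b8 pixels, hence the
 * 0x8888 masks used by is_opaque and is_transparent.
 */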

static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}
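
/* The sequence above computes a correctly rounded x * a / 255 per 8-bit
 * channel without any division: add 0x0080 for rounding, then multiply
 * by 0x0101 and keep the high 16 bits.  A scalar sketch of the same
 * trick (a hypothetical helper, mirroring the MUL_UN8 macro from
 * pixman-combine32.h):
 */
static inline uint8_t
scalar_mul_un8 (uint8_t x, uint8_t a)
{
    uint32_t t = (uint32_t)x * a + 0x80;

    /* (t + (t >> 8)) >> 8 equals (t * 0x0101) >> 16 for 16-bit t */
    return (uint8_t)((t + (t >> 8)) >> 8);
}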

static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}

static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}
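
/* over_2x128 is the Porter-Duff OVER operator on premultiplied pixels:
 * dst = src + (1 - alpha(src)) * dst, with the per-channel multiply
 * rounded as in pix_multiply above.
 */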

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}
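
/* in_over_2x128 is the fused (src IN mask) OVER dst used by masked
 * compositing: both the source and its alpha are first multiplied by
 * the per-channel mask, then the result is composited over the
 * destination.
 */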

/* load 4 pixels from a 16-byte-aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels to a 16-byte-aligned address using a write-combining
 * (non-temporal) store
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels to a 16-byte-aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels to an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}

static force_inline __m128i
load_32_1x128 (uint32_t data)
{
    return _mm_cvtsi32_si128 (data);
}

static force_inline __m128i
expand_alpha_rev_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
expand_pixel_8_1x128 (uint8_t data)
{
    return _mm_shufflelo_epi16 (
        unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
pix_multiply_1x128 (__m128i data,
                    __m128i alpha)
{
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
                                            mask_0080),
                            mask_0101);
}

static force_inline __m128i
pix_add_multiply_1x128 (__m128i* src,
                        __m128i* alpha_dst,
                        __m128i* dst,
                        __m128i* alpha_src)
{
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);

    return _mm_adds_epu8 (t1, t2);
}

static force_inline __m128i
negate_1x128 (__m128i data)
{
    return _mm_xor_si128 (data, mask_00ff);
}

static force_inline __m128i
invert_colors_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m128i
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
}

static force_inline __m128i
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
                       pix_multiply_1x128 (*alpha, *mask),
                       *dst);
}

static force_inline __m128i
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
    __m128i alpha = expand_alpha_1x128 (src);

    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
                                           _mm_or_si128 (alpha, mask_alpha)),
                       alpha,
                       dst);
}

static force_inline uint32_t
pack_1x128_32 (__m128i data)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
}

static force_inline __m128i
expand565_16_1x128 (uint16_t pixel)
{
    __m128i m = _mm_cvtsi32_si128 (pixel);

    m = unpack_565_to_8888 (m);

    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
}

static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m128i xmms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        xmms = unpack_32_1x128 (src);
        return pack_1x128_32 (
            over_1x128 (xmms, expand_alpha_1x128 (xmms),
                        unpack_32_1x128 (dst)));
    }

    return dst;
}

static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m128i ms, mm;

        mm = unpack_32_1x128 (*pm);
        mm = expand_alpha_1x128 (mm);

        ms = unpack_32_1x128 (s);
        ms = pix_multiply_1x128 (ms, mm);

        s = pack_1x128_32 (ms);
    }

    return s;
}

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}
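
/* combine1 and combine4 implement the masked fetch shared by the
 * "unified" combiners: when a mask is present, the source is multiplied
 * by the mask's expanded alpha before the operator runs.  combine4
 * additionally short-circuits to zero when all four mask pixels are
 * fully transparent.
 */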

static force_inline void
core_combine_over_u_sse2_mask (uint32_t *      pd,
                               const uint32_t* ps,
                               const uint32_t* pm,
                               int             w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i mask = load_128_unaligned ((__m128i *)pm);

        if (!is_zero (mask))
        {
            __m128i src;
            __m128i src_hi, src_lo;
            __m128i mask_hi, mask_lo;
            __m128i alpha_hi, alpha_lo;

            src = load_128_unaligned ((__m128i *)ps);

            if (is_opaque (_mm_and_si128 (src, mask)))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);
                __m128i dst_hi, dst_lo;

                unpack_128_2x128 (mask, &mask_lo, &mask_hi);
                unpack_128_2x128 (src, &src_lo, &src_hi);

                expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
                pix_multiply_2x128 (&src_lo, &src_hi,
                                    &mask_lo, &mask_hi,
                                    &src_lo, &src_hi);

                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);

                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        pm += 4;
        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;

        w--;
    }
}
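
/* All the combiners below follow the same three-phase structure seen
 * above: a scalar prologue that advances pd to a 16-byte boundary, a
 * vector body that processes four pixels per iteration with aligned
 * destination loads and stores, and a scalar epilogue for the
 * remaining (w % 4) pixels.
 */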

static force_inline void
core_combine_over_u_sse2_no_mask (uint32_t *      pd,
                                  const uint32_t* ps,
                                  int             w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i src;
        __m128i src_hi, src_lo, dst_hi, dst_lo;
        __m128i alpha_hi, alpha_lo;

        src = load_128_unaligned ((__m128i *)ps);

        if (!is_zero (src))
        {
            if (is_opaque (src))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);

                unpack_128_2x128 (src, &src_lo, &src_hi);
                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);
                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;

        w--;
    }
}

static force_inline void
sse2_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    if (pm)
        core_combine_over_u_sse2_mask (pd, ps, pm, w);
    else
        core_combine_over_u_sse2_no_mask (pd, ps, w);
}

static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        /* I'm loading unaligned because I'm not sure
         * about the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_src_lo, &xmm_src_hi);

        /* rebuild the 4 pixel data and save */
        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_src_lo, xmm_src_hi));

        w -= 4;
        ps += 4;
        pd += 4;

        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (dst),
                                expand_alpha_1x128 (unpack_32_1x128 (src))));
    }

    return dst;
}

static void
sse2_combine_in_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               pd,
                   const uint32_t *         ps,
                   const uint32_t *         pm,
                   int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               pd,
                           const uint32_t *         ps,
                           const uint32_t *         pm,
                           int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    while (w && ((uintptr_t)pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));

        if (pm)
            pm++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;

        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        ps++;
        if (pm)
            pm++;
        w--;
    }
}

static void
sse2_combine_out_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    while (w && ((uintptr_t)pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
    __m128i da = expand_alpha_1x128 (d);

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}
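
/* ATOP on premultiplied pixels:
 * dst = src * alpha(dst) + dst * (1 - alpha(src)).
 * The reverse variant further below swaps which operand keeps its own
 * alpha and which gets the negated one.
 */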

static void
sse2_combine_atop_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = expand_alpha_1x128 (s);
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
}
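
/* XOR on premultiplied pixels:
 * dst = src * (1 - alpha(dst)) + dst * (1 - alpha(src));
 * both alphas are negated before the fused add-multiply.
 */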

static void
sse2_combine_xor_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
sse2_combine_add_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    while (w && (uintptr_t)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        if (pm)
            pm++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        w--;
    }

    while (w >= 4)
    {
        __m128i s;

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save_128_aligned (
            (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m128i ms = unpack_32_1x128 (src);
    __m128i md = unpack_32_1x128 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pix_multiply_1x128 (
            ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
}
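
/* SATURATE: when the source alpha exceeds the free alpha left in the
 * destination (~dst >> 24), the source is first scaled by
 * DIV_UN8 (da, sa) (from pixman-combine32.h), so that the saturating
 * add lands exactly at full coverage instead of clamping channel by
 * channel.
 */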

static void
sse2_combine_saturate_u (pixman_implementation_t *imp,
                         pixman_op_t              op,
                         uint32_t *               pd,
                         const uint32_t *         ps,
                         const uint32_t *         pm,
                         int                      w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    while (w && (uintptr_t)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst = load_128_aligned ((__m128i*)pd);
        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

        pack_cmp = _mm_movemask_epi8 (
            _mm_cmpgt_epi32 (
                _mm_srli_epi32 (xmm_src, 24),
                _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

        /* if some src alpha is greater than the respective ~dst alpha */
        if (pack_cmp)
        {
            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;
        }
        else
        {
            save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

            pd += 4;
            ps += 4;
            if (pm)
                pm += 4;
        }

        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_src_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }

    while (w >= 4)
    {
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }
}
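
/* sse2_combine_src_ca above and the other _ca combiners below take a
 * component-alpha mask: the mask supplies a separate 8-bit weight for
 * each of the four channels rather than a single alpha, so the source
 * and mask are multiplied channel by channel and the mask pointer is
 * never NULL in these paths.
 */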
   1560 
   1561 static force_inline uint32_t
   1562 core_combine_over_ca_pixel_sse2 (uint32_t src,
   1563                                  uint32_t mask,
   1564                                  uint32_t dst)
   1565 {
   1566     __m128i s = unpack_32_1x128 (src);
   1567     __m128i expAlpha = expand_alpha_1x128 (s);
   1568     __m128i unpk_mask = unpack_32_1x128 (mask);
   1569     __m128i unpk_dst  = unpack_32_1x128 (dst);
   1570 
   1571     return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
   1572 }
   1573 
   1574 static void
   1575 sse2_combine_over_ca (pixman_implementation_t *imp,
   1576                       pixman_op_t              op,
   1577                       uint32_t *               pd,
   1578                       const uint32_t *         ps,
   1579                       const uint32_t *         pm,
   1580                       int                      w)
   1581 {
   1582     uint32_t s, m, d;
   1583 
   1584     __m128i xmm_alpha_lo, xmm_alpha_hi;
   1585     __m128i xmm_src_lo, xmm_src_hi;
   1586     __m128i xmm_dst_lo, xmm_dst_hi;
   1587     __m128i xmm_mask_lo, xmm_mask_hi;
   1588 
   1589     while (w && (uintptr_t)pd & 15)
   1590     {
   1591 	s = *ps++;
   1592 	m = *pm++;
   1593 	d = *pd;
   1594 
   1595 	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
   1596 	w--;
   1597     }
   1598 
   1599     while (w >= 4)
   1600     {
   1601 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
   1602 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
   1603 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
   1604 
   1605 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   1606 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   1607 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
   1608 
   1609 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
   1610 			    &xmm_alpha_lo, &xmm_alpha_hi);
   1611 
   1612 	in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
   1613 		       &xmm_alpha_lo, &xmm_alpha_hi,
   1614 		       &xmm_mask_lo, &xmm_mask_hi,
   1615 		       &xmm_dst_lo, &xmm_dst_hi);
   1616 
   1617 	save_128_aligned (
   1618 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   1619 
   1620 	ps += 4;
   1621 	pd += 4;
   1622 	pm += 4;
   1623 	w -= 4;
   1624     }
   1625 
   1626     while (w)
   1627     {
   1628 	s = *ps++;
   1629 	m = *pm++;
   1630 	d = *pd;
   1631 
   1632 	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
   1633 	w--;
   1634     }
   1635 }
   1636 
   1637 static force_inline uint32_t
   1638 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
   1639                                          uint32_t mask,
   1640                                          uint32_t dst)
   1641 {
   1642     __m128i d = unpack_32_1x128 (dst);
   1643 
   1644     return pack_1x128_32 (
   1645 	over_1x128 (d, expand_alpha_1x128 (d),
   1646 		    pix_multiply_1x128 (unpack_32_1x128 (src),
   1647 					unpack_32_1x128 (mask))));
   1648 }
   1649 
   1650 static void
   1651 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
   1652                               pixman_op_t              op,
   1653                               uint32_t *               pd,
   1654                               const uint32_t *         ps,
   1655                               const uint32_t *         pm,
   1656                               int                      w)
   1657 {
   1658     uint32_t s, m, d;
   1659 
   1660     __m128i xmm_alpha_lo, xmm_alpha_hi;
   1661     __m128i xmm_src_lo, xmm_src_hi;
   1662     __m128i xmm_dst_lo, xmm_dst_hi;
   1663     __m128i xmm_mask_lo, xmm_mask_hi;
   1664 
   1665     while (w && (uintptr_t)pd & 15)
   1666     {
   1667 	s = *ps++;
   1668 	m = *pm++;
   1669 	d = *pd;
   1670 
   1671 	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
   1672 	w--;
   1673     }
   1674 
   1675     while (w >= 4)
   1676     {
   1677 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
   1678 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
   1679 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
   1680 
   1681 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   1682 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   1683 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
   1684 
   1685 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
   1686 			    &xmm_alpha_lo, &xmm_alpha_hi);
   1687 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
   1688 			    &xmm_mask_lo, &xmm_mask_hi,
   1689 			    &xmm_mask_lo, &xmm_mask_hi);
   1690 
   1691 	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
   1692 		    &xmm_alpha_lo, &xmm_alpha_hi,
   1693 		    &xmm_mask_lo, &xmm_mask_hi);
   1694 
   1695 	save_128_aligned (
   1696 	    (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
   1697 
   1698 	ps += 4;
   1699 	pd += 4;
   1700 	pm += 4;
   1701 	w -= 4;
   1702     }
   1703 
   1704     while (w)
   1705     {
   1706 	s = *ps++;
   1707 	m = *pm++;
   1708 	d = *pd;
   1709 
   1710 	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
   1711 	w--;
   1712     }
   1713 }
   1714 
   1715 static void
   1716 sse2_combine_in_ca (pixman_implementation_t *imp,
   1717                     pixman_op_t              op,
   1718                     uint32_t *               pd,
   1719                     const uint32_t *         ps,
   1720                     const uint32_t *         pm,
   1721                     int                      w)
   1722 {
   1723     uint32_t s, m, d;
   1724 
   1725     __m128i xmm_alpha_lo, xmm_alpha_hi;
   1726     __m128i xmm_src_lo, xmm_src_hi;
   1727     __m128i xmm_dst_lo, xmm_dst_hi;
   1728     __m128i xmm_mask_lo, xmm_mask_hi;
   1729 
   1730     while (w && (uintptr_t)pd & 15)
   1731     {
   1732 	s = *ps++;
   1733 	m = *pm++;
   1734 	d = *pd;
   1735 
   1736 	*pd++ = pack_1x128_32 (
   1737 	    pix_multiply_1x128 (
   1738 		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
   1739 		expand_alpha_1x128 (unpack_32_1x128 (d))));
   1740 
   1741 	w--;
   1742     }
   1743 
   1744     while (w >= 4)
   1745     {
   1746 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
   1747 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
   1748 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
   1749 
   1750 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   1751 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   1752 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
   1753 
   1754 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
   1755 			    &xmm_alpha_lo, &xmm_alpha_hi);
   1756 
   1757 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
   1758 			    &xmm_mask_lo, &xmm_mask_hi,
   1759 			    &xmm_dst_lo, &xmm_dst_hi);
   1760 
   1761 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
   1762 			    &xmm_alpha_lo, &xmm_alpha_hi,
   1763 			    &xmm_dst_lo, &xmm_dst_hi);
   1764 
   1765 	save_128_aligned (
   1766 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   1767 
   1768 	ps += 4;
   1769 	pd += 4;
   1770 	pm += 4;
   1771 	w -= 4;
   1772     }
   1773 
   1774     while (w)
   1775     {
   1776 	s = *ps++;
   1777 	m = *pm++;
   1778 	d = *pd;
   1779 
   1780 	*pd++ = pack_1x128_32 (
   1781 	    pix_multiply_1x128 (
   1782 		pix_multiply_1x128 (
   1783 		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
   1784 		expand_alpha_1x128 (unpack_32_1x128 (d))));
   1785 
   1786 	w--;
   1787     }
   1788 }
   1789 
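/* Component-alpha IN reverse: result = dest * (mask * src_alpha), per channel. */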
   1790 static void
   1791 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
   1792                             pixman_op_t              op,
   1793                             uint32_t *               pd,
   1794                             const uint32_t *         ps,
   1795                             const uint32_t *         pm,
   1796                             int                      w)
   1797 {
   1798     uint32_t s, m, d;
   1799 
   1800     __m128i xmm_alpha_lo, xmm_alpha_hi;
   1801     __m128i xmm_src_lo, xmm_src_hi;
   1802     __m128i xmm_dst_lo, xmm_dst_hi;
   1803     __m128i xmm_mask_lo, xmm_mask_hi;
   1804 
   1805     while (w && (uintptr_t)pd & 15)
   1806     {
   1807 	s = *ps++;
   1808 	m = *pm++;
   1809 	d = *pd;
   1810 
   1811 	*pd++ = pack_1x128_32 (
   1812 	    pix_multiply_1x128 (
   1813 		unpack_32_1x128 (d),
   1814 		pix_multiply_1x128 (unpack_32_1x128 (m),
   1815 				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
   1816 	w--;
   1817     }
   1818 
   1819     while (w >= 4)
   1820     {
   1821 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
   1822 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
   1823 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
   1824 
   1825 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   1826 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   1827 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
   1828 
   1829 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
   1830 			    &xmm_alpha_lo, &xmm_alpha_hi);
   1831 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
   1832 			    &xmm_alpha_lo, &xmm_alpha_hi,
   1833 			    &xmm_alpha_lo, &xmm_alpha_hi);
   1834 
   1835 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
   1836 			    &xmm_alpha_lo, &xmm_alpha_hi,
   1837 			    &xmm_dst_lo, &xmm_dst_hi);
   1838 
   1839 	save_128_aligned (
   1840 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   1841 
   1842 	ps += 4;
   1843 	pd += 4;
   1844 	pm += 4;
   1845 	w -= 4;
   1846     }
   1847 
   1848     while (w)
   1849     {
   1850 	s = *ps++;
   1851 	m = *pm++;
   1852 	d = *pd;
   1853 
   1854 	*pd++ = pack_1x128_32 (
   1855 	    pix_multiply_1x128 (
   1856 		unpack_32_1x128 (d),
   1857 		pix_multiply_1x128 (unpack_32_1x128 (m),
   1858 				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
   1859 	w--;
   1860     }
   1861 }
   1862 
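/* Component-alpha OUT: result = (src * mask) * (1 - dest_alpha), per channel. */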
   1863 static void
   1864 sse2_combine_out_ca (pixman_implementation_t *imp,
   1865                      pixman_op_t              op,
   1866                      uint32_t *               pd,
   1867                      const uint32_t *         ps,
   1868                      const uint32_t *         pm,
   1869                      int                      w)
   1870 {
   1871     uint32_t s, m, d;
   1872 
   1873     __m128i xmm_alpha_lo, xmm_alpha_hi;
   1874     __m128i xmm_src_lo, xmm_src_hi;
   1875     __m128i xmm_dst_lo, xmm_dst_hi;
   1876     __m128i xmm_mask_lo, xmm_mask_hi;
   1877 
   1878     while (w && (uintptr_t)pd & 15)
   1879     {
   1880 	s = *ps++;
   1881 	m = *pm++;
   1882 	d = *pd;
   1883 
   1884 	*pd++ = pack_1x128_32 (
   1885 	    pix_multiply_1x128 (
   1886 		pix_multiply_1x128 (
   1887 		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
   1888 		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
   1889 	w--;
   1890     }
   1891 
   1892     while (w >= 4)
   1893     {
   1894 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
   1895 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
   1896 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
   1897 
   1898 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   1899 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   1900 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
   1901 
   1902 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
   1903 			    &xmm_alpha_lo, &xmm_alpha_hi);
   1904 	negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
   1905 		      &xmm_alpha_lo, &xmm_alpha_hi);
   1906 
   1907 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
   1908 			    &xmm_mask_lo, &xmm_mask_hi,
   1909 			    &xmm_dst_lo, &xmm_dst_hi);
   1910 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
   1911 			    &xmm_alpha_lo, &xmm_alpha_hi,
   1912 			    &xmm_dst_lo, &xmm_dst_hi);
   1913 
   1914 	save_128_aligned (
   1915 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   1916 
   1917 	ps += 4;
   1918 	pd += 4;
   1919 	pm += 4;
   1920 	w -= 4;
   1921     }
   1922 
   1923     while (w)
   1924     {
   1925 	s = *ps++;
   1926 	m = *pm++;
   1927 	d = *pd;
   1928 
   1929 	*pd++ = pack_1x128_32 (
   1930 	    pix_multiply_1x128 (
   1931 		pix_multiply_1x128 (
   1932 		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
   1933 		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
   1934 
   1935 	w--;
   1936     }
   1937 }
   1938 
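/* Component-alpha OUT reverse: result = dest * (1 - mask * src_alpha), per channel. */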
   1939 static void
   1940 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
   1941                              pixman_op_t              op,
   1942                              uint32_t *               pd,
   1943                              const uint32_t *         ps,
   1944                              const uint32_t *         pm,
   1945                              int                      w)
   1946 {
   1947     uint32_t s, m, d;
   1948 
   1949     __m128i xmm_alpha_lo, xmm_alpha_hi;
   1950     __m128i xmm_src_lo, xmm_src_hi;
   1951     __m128i xmm_dst_lo, xmm_dst_hi;
   1952     __m128i xmm_mask_lo, xmm_mask_hi;
   1953 
   1954     while (w && (uintptr_t)pd & 15)
   1955     {
   1956 	s = *ps++;
   1957 	m = *pm++;
   1958 	d = *pd;
   1959 
   1960 	*pd++ = pack_1x128_32 (
   1961 	    pix_multiply_1x128 (
   1962 		unpack_32_1x128 (d),
   1963 		negate_1x128 (pix_multiply_1x128 (
   1964 				 unpack_32_1x128 (m),
   1965 				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
   1966 	w--;
   1967     }
   1968 
   1969     while (w >= 4)
   1970     {
   1971 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
   1972 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
   1973 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
   1974 
   1975 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   1976 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   1977 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
   1978 
   1979 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
   1980 			    &xmm_alpha_lo, &xmm_alpha_hi);
   1981 
   1982 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
   1983 			    &xmm_alpha_lo, &xmm_alpha_hi,
   1984 			    &xmm_mask_lo, &xmm_mask_hi);
   1985 
   1986 	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
   1987 		      &xmm_mask_lo, &xmm_mask_hi);
   1988 
   1989 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
   1990 			    &xmm_mask_lo, &xmm_mask_hi,
   1991 			    &xmm_dst_lo, &xmm_dst_hi);
   1992 
   1993 	save_128_aligned (
   1994 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   1995 
   1996 	ps += 4;
   1997 	pd += 4;
   1998 	pm += 4;
   1999 	w -= 4;
   2000     }
   2001 
   2002     while (w)
   2003     {
   2004 	s = *ps++;
   2005 	m = *pm++;
   2006 	d = *pd;
   2007 
   2008 	*pd++ = pack_1x128_32 (
   2009 	    pix_multiply_1x128 (
   2010 		unpack_32_1x128 (d),
   2011 		negate_1x128 (pix_multiply_1x128 (
   2012 				 unpack_32_1x128 (m),
   2013 				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
   2014 	w--;
   2015     }
   2016 }
   2017 
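/* Component-alpha ATOP: result = (src * mask) * dest_alpha
 *                              + dest * (1 - mask * src_alpha), per channel.
 */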
   2018 static force_inline uint32_t
   2019 core_combine_atop_ca_pixel_sse2 (uint32_t src,
   2020                                  uint32_t mask,
   2021                                  uint32_t dst)
   2022 {
   2023     __m128i m = unpack_32_1x128 (mask);
   2024     __m128i s = unpack_32_1x128 (src);
   2025     __m128i d = unpack_32_1x128 (dst);
   2026     __m128i sa = expand_alpha_1x128 (s);
   2027     __m128i da = expand_alpha_1x128 (d);
   2028 
   2029     s = pix_multiply_1x128 (s, m);
   2030     m = negate_1x128 (pix_multiply_1x128 (m, sa));
   2031 
   2032     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
   2033 }
   2034 
   2035 static void
   2036 sse2_combine_atop_ca (pixman_implementation_t *imp,
   2037                       pixman_op_t              op,
   2038                       uint32_t *               pd,
   2039                       const uint32_t *         ps,
   2040                       const uint32_t *         pm,
   2041                       int                      w)
   2042 {
   2043     uint32_t s, m, d;
   2044 
   2045     __m128i xmm_src_lo, xmm_src_hi;
   2046     __m128i xmm_dst_lo, xmm_dst_hi;
   2047     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
   2048     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
   2049     __m128i xmm_mask_lo, xmm_mask_hi;
   2050 
   2051     while (w && (uintptr_t)pd & 15)
   2052     {
   2053 	s = *ps++;
   2054 	m = *pm++;
   2055 	d = *pd;
   2056 
   2057 	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
   2058 	w--;
   2059     }
   2060 
   2061     while (w >= 4)
   2062     {
   2063 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
   2064 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
   2065 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
   2066 
   2067 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   2068 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   2069 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
   2070 
   2071 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
   2072 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
   2073 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
   2074 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
   2075 
   2076 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
   2077 			    &xmm_mask_lo, &xmm_mask_hi,
   2078 			    &xmm_src_lo, &xmm_src_hi);
   2079 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
   2080 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
   2081 			    &xmm_mask_lo, &xmm_mask_hi);
   2082 
   2083 	negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
   2084 
   2085 	pix_add_multiply_2x128 (
   2086 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
   2087 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
   2088 	    &xmm_dst_lo, &xmm_dst_hi);
   2089 
   2090 	save_128_aligned (
   2091 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   2092 
   2093 	ps += 4;
   2094 	pd += 4;
   2095 	pm += 4;
   2096 	w -= 4;
   2097     }
   2098 
   2099     while (w)
   2100     {
   2101 	s = *ps++;
   2102 	m = *pm++;
   2103 	d = *pd;
   2104 
   2105 	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
   2106 	w--;
   2107     }
   2108 }
   2109 
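/* Component-alpha ATOP reverse: result = dest * (mask * src_alpha)
 *                                      + (src * mask) * (1 - dest_alpha), per channel.
 */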
   2110 static force_inline uint32_t
   2111 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
   2112                                          uint32_t mask,
   2113                                          uint32_t dst)
   2114 {
   2115     __m128i m = unpack_32_1x128 (mask);
   2116     __m128i s = unpack_32_1x128 (src);
   2117     __m128i d = unpack_32_1x128 (dst);
   2118 
   2119     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
   2120     __m128i sa = expand_alpha_1x128 (s);
   2121 
   2122     s = pix_multiply_1x128 (s, m);
   2123     m = pix_multiply_1x128 (m, sa);
   2124 
   2125     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
   2126 }
   2127 
   2128 static void
   2129 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
   2130                               pixman_op_t              op,
   2131                               uint32_t *               pd,
   2132                               const uint32_t *         ps,
   2133                               const uint32_t *         pm,
   2134                               int                      w)
   2135 {
   2136     uint32_t s, m, d;
   2137 
   2138     __m128i xmm_src_lo, xmm_src_hi;
   2139     __m128i xmm_dst_lo, xmm_dst_hi;
   2140     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
   2141     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
   2142     __m128i xmm_mask_lo, xmm_mask_hi;
   2143 
   2144     while (w && (uintptr_t)pd & 15)
   2145     {
   2146 	s = *ps++;
   2147 	m = *pm++;
   2148 	d = *pd;
   2149 
   2150 	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
   2151 	w--;
   2152     }
   2153 
   2154     while (w >= 4)
   2155     {
   2156 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
   2157 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
   2158 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
   2159 
   2160 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   2161 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   2162 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
   2163 
   2164 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
   2165 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
   2166 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
   2167 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
   2168 
   2169 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
   2170 			    &xmm_mask_lo, &xmm_mask_hi,
   2171 			    &xmm_src_lo, &xmm_src_hi);
   2172 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
   2173 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
   2174 			    &xmm_mask_lo, &xmm_mask_hi);
   2175 
   2176 	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
   2177 		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
   2178 
   2179 	pix_add_multiply_2x128 (
   2180 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
   2181 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
   2182 	    &xmm_dst_lo, &xmm_dst_hi);
   2183 
   2184 	save_128_aligned (
   2185 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   2186 
   2187 	ps += 4;
   2188 	pd += 4;
   2189 	pm += 4;
   2190 	w -= 4;
   2191     }
   2192 
   2193     while (w)
   2194     {
   2195 	s = *ps++;
   2196 	m = *pm++;
   2197 	d = *pd;
   2198 
   2199 	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
   2200 	w--;
   2201     }
   2202 }
   2203 
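/* Component-alpha XOR: result = dest * (1 - mask * src_alpha)
 *                             + (src * mask) * (1 - dest_alpha), per channel.
 */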
   2204 static force_inline uint32_t
   2205 core_combine_xor_ca_pixel_sse2 (uint32_t src,
   2206                                 uint32_t mask,
   2207                                 uint32_t dst)
   2208 {
   2209     __m128i a = unpack_32_1x128 (mask);
   2210     __m128i s = unpack_32_1x128 (src);
   2211     __m128i d = unpack_32_1x128 (dst);
   2212 
   2213     __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
   2214 				       a, expand_alpha_1x128 (s)));
   2215     __m128i dest      = pix_multiply_1x128 (s, a);
   2216     __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
   2217 
   2218     return pack_1x128_32 (pix_add_multiply_1x128 (&d,
   2219                                                 &alpha_dst,
   2220                                                 &dest,
   2221                                                 &alpha_src));
   2222 }
   2223 
   2224 static void
   2225 sse2_combine_xor_ca (pixman_implementation_t *imp,
   2226                      pixman_op_t              op,
   2227                      uint32_t *               pd,
   2228                      const uint32_t *         ps,
   2229                      const uint32_t *         pm,
   2230                      int                      w)
   2231 {
   2232     uint32_t s, m, d;
   2233 
   2234     __m128i xmm_src_lo, xmm_src_hi;
   2235     __m128i xmm_dst_lo, xmm_dst_hi;
   2236     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
   2237     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
   2238     __m128i xmm_mask_lo, xmm_mask_hi;
   2239 
   2240     while (w && (uintptr_t)pd & 15)
   2241     {
   2242 	s = *ps++;
   2243 	m = *pm++;
   2244 	d = *pd;
   2245 
   2246 	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
   2247 	w--;
   2248     }
   2249 
   2250     while (w >= 4)
   2251     {
   2252 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
   2253 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
   2254 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
   2255 
   2256 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   2257 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   2258 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
   2259 
   2260 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
   2261 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
   2262 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
   2263 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
   2264 
   2265 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
   2266 			    &xmm_mask_lo, &xmm_mask_hi,
   2267 			    &xmm_src_lo, &xmm_src_hi);
   2268 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
   2269 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
   2270 			    &xmm_mask_lo, &xmm_mask_hi);
   2271 
   2272 	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
   2273 		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
   2274 	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
   2275 		      &xmm_mask_lo, &xmm_mask_hi);
   2276 
   2277 	pix_add_multiply_2x128 (
   2278 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
   2279 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
   2280 	    &xmm_dst_lo, &xmm_dst_hi);
   2281 
   2282 	save_128_aligned (
   2283 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   2284 
   2285 	ps += 4;
   2286 	pd += 4;
   2287 	pm += 4;
   2288 	w -= 4;
   2289     }
   2290 
   2291     while (w)
   2292     {
   2293 	s = *ps++;
   2294 	m = *pm++;
   2295 	d = *pd;
   2296 
   2297 	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
   2298 	w--;
   2299     }
   2300 }
   2301 
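/* Component-alpha ADD: result = saturate (src * mask + dest), per channel. */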
   2302 static void
   2303 sse2_combine_add_ca (pixman_implementation_t *imp,
   2304                      pixman_op_t              op,
   2305                      uint32_t *               pd,
   2306                      const uint32_t *         ps,
   2307                      const uint32_t *         pm,
   2308                      int                      w)
   2309 {
   2310     uint32_t s, m, d;
   2311 
   2312     __m128i xmm_src_lo, xmm_src_hi;
   2313     __m128i xmm_dst_lo, xmm_dst_hi;
   2314     __m128i xmm_mask_lo, xmm_mask_hi;
   2315 
   2316     while (w && (uintptr_t)pd & 15)
   2317     {
   2318 	s = *ps++;
   2319 	m = *pm++;
   2320 	d = *pd;
   2321 
   2322 	*pd++ = pack_1x128_32 (
   2323 	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
   2324 					       unpack_32_1x128 (m)),
   2325 			   unpack_32_1x128 (d)));
   2326 	w--;
   2327     }
   2328 
   2329     while (w >= 4)
   2330     {
   2331 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
   2332 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
   2333 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
   2334 
   2335 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   2336 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
   2337 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   2338 
   2339 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
   2340 			    &xmm_mask_lo, &xmm_mask_hi,
   2341 			    &xmm_src_lo, &xmm_src_hi);
   2342 
   2343 	save_128_aligned (
   2344 	    (__m128i*)pd, pack_2x128_128 (
   2345 		_mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
   2346 		_mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
   2347 
   2348 	ps += 4;
   2349 	pd += 4;
   2350 	pm += 4;
   2351 	w -= 4;
   2352     }
   2353 
   2354     while (w)
   2355     {
   2356 	s = *ps++;
   2357 	m = *pm++;
   2358 	d = *pd;
   2359 
   2360 	*pd++ = pack_1x128_32 (
   2361 	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
   2362 					       unpack_32_1x128 (m)),
   2363 			   unpack_32_1x128 (d)));
   2364 	w--;
   2365     }
   2366 }
   2367 
   2368 static force_inline __m128i
   2369 create_mask_16_128 (uint16_t mask)
   2370 {
   2371     return _mm_set1_epi16 (mask);
   2372 }
   2373 
   2374 /* Work around a code generation bug in Sun Studio 12. */
   2375 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
   2376 # define create_mask_2x32_128(mask0, mask1)				\
   2377     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
   2378 #else
   2379 static force_inline __m128i
   2380 create_mask_2x32_128 (uint32_t mask0,
   2381                       uint32_t mask1)
   2382 {
   2383     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
   2384 }
   2385 #endif
   2386 
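/* Composite a solid color OVER a 32-bit (8888) destination. */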
   2387 static void
   2388 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
   2389                             pixman_composite_info_t *info)
   2390 {
   2391     PIXMAN_COMPOSITE_ARGS (info);
   2392     uint32_t src;
   2393     uint32_t    *dst_line, *dst, d;
   2394     int32_t w;
   2395     int dst_stride;
   2396     __m128i xmm_src, xmm_alpha;
   2397     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   2398 
   2399     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2400 
   2401     if (src == 0)
   2402 	return;
   2403 
   2404     PIXMAN_IMAGE_GET_LINE (
   2405 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   2406 
   2407     xmm_src = expand_pixel_32_1x128 (src);
   2408     xmm_alpha = expand_alpha_1x128 (xmm_src);
   2409 
   2410     while (height--)
   2411     {
   2412 	dst = dst_line;
   2413 
   2414 	dst_line += dst_stride;
   2415 	w = width;
   2416 
   2417 	while (w && (uintptr_t)dst & 15)
   2418 	{
   2419 	    d = *dst;
   2420 	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
   2421 						xmm_alpha,
   2422 						unpack_32_1x128 (d)));
   2423 	    w--;
   2424 	}
   2425 
   2426 	while (w >= 4)
   2427 	{
   2428 	    xmm_dst = load_128_aligned ((__m128i*)dst);
   2429 
   2430 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   2431 
   2432 	    over_2x128 (&xmm_src, &xmm_src,
   2433 			&xmm_alpha, &xmm_alpha,
   2434 			&xmm_dst_lo, &xmm_dst_hi);
   2435 
    2436 	    /* rebuild the 4 pixel data and save */
   2437 	    save_128_aligned (
   2438 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   2439 
   2440 	    w -= 4;
   2441 	    dst += 4;
   2442 	}
   2443 
   2444 	while (w)
   2445 	{
   2446 	    d = *dst;
   2447 	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
   2448 						xmm_alpha,
   2449 						unpack_32_1x128 (d)));
   2450 	    w--;
   2451 	}
   2452 
   2453     }
   2454 }
   2455 
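/* Composite a solid color OVER an r5g6b5 destination. */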
   2456 static void
   2457 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
   2458                             pixman_composite_info_t *info)
   2459 {
   2460     PIXMAN_COMPOSITE_ARGS (info);
   2461     uint32_t src;
   2462     uint16_t    *dst_line, *dst, d;
   2463     int32_t w;
   2464     int dst_stride;
   2465     __m128i xmm_src, xmm_alpha;
   2466     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
   2467 
   2468     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2469 
   2470     if (src == 0)
   2471 	return;
   2472 
   2473     PIXMAN_IMAGE_GET_LINE (
   2474 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   2475 
   2476     xmm_src = expand_pixel_32_1x128 (src);
   2477     xmm_alpha = expand_alpha_1x128 (xmm_src);
   2478 
   2479     while (height--)
   2480     {
   2481 	dst = dst_line;
   2482 
   2483 	dst_line += dst_stride;
   2484 	w = width;
   2485 
   2486 	while (w && (uintptr_t)dst & 15)
   2487 	{
   2488 	    d = *dst;
   2489 
   2490 	    *dst++ = pack_565_32_16 (
   2491 		pack_1x128_32 (over_1x128 (xmm_src,
   2492 					   xmm_alpha,
   2493 					   expand565_16_1x128 (d))));
   2494 	    w--;
   2495 	}
   2496 
   2497 	while (w >= 8)
   2498 	{
   2499 	    xmm_dst = load_128_aligned ((__m128i*)dst);
   2500 
   2501 	    unpack_565_128_4x128 (xmm_dst,
   2502 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
   2503 
   2504 	    over_2x128 (&xmm_src, &xmm_src,
   2505 			&xmm_alpha, &xmm_alpha,
   2506 			&xmm_dst0, &xmm_dst1);
   2507 	    over_2x128 (&xmm_src, &xmm_src,
   2508 			&xmm_alpha, &xmm_alpha,
   2509 			&xmm_dst2, &xmm_dst3);
   2510 
   2511 	    xmm_dst = pack_565_4x128_128 (
   2512 		&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
   2513 
   2514 	    save_128_aligned ((__m128i*)dst, xmm_dst);
   2515 
   2516 	    dst += 8;
   2517 	    w -= 8;
   2518 	}
   2519 
   2520 	while (w--)
   2521 	{
   2522 	    d = *dst;
   2523 	    *dst++ = pack_565_32_16 (
   2524 		pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
   2525 					   expand565_16_1x128 (d))));
   2526 	}
   2527     }
   2528 
   2529 }
   2530 
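/* Saturating ADD of a solid color through an a8r8g8b8 component-alpha
 * mask into a 32-bit destination.
 */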
   2531 static void
   2532 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
   2533 				   pixman_composite_info_t *info)
   2534 {
   2535     PIXMAN_COMPOSITE_ARGS (info);
   2536     uint32_t src;
   2537     uint32_t    *dst_line, d;
   2538     uint32_t    *mask_line, m;
   2539     uint32_t pack_cmp;
   2540     int dst_stride, mask_stride;
   2541 
   2542     __m128i xmm_src;
   2543     __m128i xmm_dst;
   2544     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
   2545 
   2546     __m128i mmx_src, mmx_mask, mmx_dest;
   2547 
   2548     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2549 
   2550     if (src == 0)
   2551 	return;
   2552 
   2553     PIXMAN_IMAGE_GET_LINE (
   2554 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   2555     PIXMAN_IMAGE_GET_LINE (
   2556 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
   2557 
   2558     xmm_src = _mm_unpacklo_epi8 (
   2559 	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
   2560     mmx_src   = xmm_src;
   2561 
   2562     while (height--)
   2563     {
   2564 	int w = width;
   2565 	const uint32_t *pm = (uint32_t *)mask_line;
   2566 	uint32_t *pd = (uint32_t *)dst_line;
   2567 
   2568 	dst_line += dst_stride;
   2569 	mask_line += mask_stride;
   2570 
   2571 	while (w && (uintptr_t)pd & 15)
   2572 	{
   2573 	    m = *pm++;
   2574 
   2575 	    if (m)
   2576 	    {
   2577 		d = *pd;
   2578 
   2579 		mmx_mask = unpack_32_1x128 (m);
   2580 		mmx_dest = unpack_32_1x128 (d);
   2581 
   2582 		*pd = pack_1x128_32 (
   2583 		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
   2584 				   mmx_dest));
   2585 	    }
   2586 
   2587 	    pd++;
   2588 	    w--;
   2589 	}
   2590 
   2591 	while (w >= 4)
   2592 	{
   2593 	    xmm_mask = load_128_unaligned ((__m128i*)pm);
   2594 
   2595 	    pack_cmp =
   2596 		_mm_movemask_epi8 (
   2597 		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
   2598 
    2599 	    /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
   2600 	    if (pack_cmp != 0xffff)
   2601 	    {
   2602 		xmm_dst = load_128_aligned ((__m128i*)pd);
   2603 
   2604 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
   2605 
   2606 		pix_multiply_2x128 (&xmm_src, &xmm_src,
   2607 				    &xmm_mask_lo, &xmm_mask_hi,
   2608 				    &xmm_mask_lo, &xmm_mask_hi);
   2609 		xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
   2610 
   2611 		save_128_aligned (
   2612 		    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
   2613 	    }
   2614 
   2615 	    pd += 4;
   2616 	    pm += 4;
   2617 	    w -= 4;
   2618 	}
   2619 
   2620 	while (w)
   2621 	{
   2622 	    m = *pm++;
   2623 
   2624 	    if (m)
   2625 	    {
   2626 		d = *pd;
   2627 
   2628 		mmx_mask = unpack_32_1x128 (m);
   2629 		mmx_dest = unpack_32_1x128 (d);
   2630 
   2631 		*pd = pack_1x128_32 (
   2632 		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
   2633 				   mmx_dest));
   2634 	    }
   2635 
   2636 	    pd++;
   2637 	    w--;
   2638 	}
   2639     }
   2640 
   2641 }
   2642 
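/* Composite a solid color OVER a 32-bit destination through an
 * a8r8g8b8 component-alpha mask.
 */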
   2643 static void
   2644 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
   2645                                     pixman_composite_info_t *info)
   2646 {
   2647     PIXMAN_COMPOSITE_ARGS (info);
   2648     uint32_t src;
   2649     uint32_t    *dst_line, d;
   2650     uint32_t    *mask_line, m;
   2651     uint32_t pack_cmp;
   2652     int dst_stride, mask_stride;
   2653 
   2654     __m128i xmm_src, xmm_alpha;
   2655     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   2656     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
   2657 
   2658     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
   2659 
   2660     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2661 
   2662     if (src == 0)
   2663 	return;
   2664 
   2665     PIXMAN_IMAGE_GET_LINE (
   2666 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   2667     PIXMAN_IMAGE_GET_LINE (
   2668 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
   2669 
   2670     xmm_src = _mm_unpacklo_epi8 (
   2671 	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
   2672     xmm_alpha = expand_alpha_1x128 (xmm_src);
   2673     mmx_src   = xmm_src;
   2674     mmx_alpha = xmm_alpha;
   2675 
   2676     while (height--)
   2677     {
   2678 	int w = width;
   2679 	const uint32_t *pm = (uint32_t *)mask_line;
   2680 	uint32_t *pd = (uint32_t *)dst_line;
   2681 
   2682 	dst_line += dst_stride;
   2683 	mask_line += mask_stride;
   2684 
   2685 	while (w && (uintptr_t)pd & 15)
   2686 	{
   2687 	    m = *pm++;
   2688 
   2689 	    if (m)
   2690 	    {
   2691 		d = *pd;
   2692 		mmx_mask = unpack_32_1x128 (m);
   2693 		mmx_dest = unpack_32_1x128 (d);
   2694 
   2695 		*pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
   2696 		                                  &mmx_alpha,
   2697 		                                  &mmx_mask,
   2698 		                                  &mmx_dest));
   2699 	    }
   2700 
   2701 	    pd++;
   2702 	    w--;
   2703 	}
   2704 
   2705 	while (w >= 4)
   2706 	{
   2707 	    xmm_mask = load_128_unaligned ((__m128i*)pm);
   2708 
   2709 	    pack_cmp =
   2710 		_mm_movemask_epi8 (
   2711 		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
   2712 
    2713 	    /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
   2714 	    if (pack_cmp != 0xffff)
   2715 	    {
   2716 		xmm_dst = load_128_aligned ((__m128i*)pd);
   2717 
   2718 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
   2719 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   2720 
   2721 		in_over_2x128 (&xmm_src, &xmm_src,
   2722 			       &xmm_alpha, &xmm_alpha,
   2723 			       &xmm_mask_lo, &xmm_mask_hi,
   2724 			       &xmm_dst_lo, &xmm_dst_hi);
   2725 
   2726 		save_128_aligned (
   2727 		    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   2728 	    }
   2729 
   2730 	    pd += 4;
   2731 	    pm += 4;
   2732 	    w -= 4;
   2733 	}
   2734 
   2735 	while (w)
   2736 	{
   2737 	    m = *pm++;
   2738 
   2739 	    if (m)
   2740 	    {
   2741 		d = *pd;
   2742 		mmx_mask = unpack_32_1x128 (m);
   2743 		mmx_dest = unpack_32_1x128 (d);
   2744 
   2745 		*pd = pack_1x128_32 (
   2746 		    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
   2747 	    }
   2748 
   2749 	    pd++;
   2750 	    w--;
   2751 	}
   2752     }
   2753 
   2754 }
   2755 
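/* Composite an a8r8g8b8 source OVER a 32-bit destination, scaled by
 * the alpha of a solid mask.
 */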
   2756 static void
   2757 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
   2758                                  pixman_composite_info_t *info)
   2759 {
   2760     PIXMAN_COMPOSITE_ARGS (info);
   2761     uint32_t    *dst_line, *dst;
   2762     uint32_t    *src_line, *src;
   2763     uint32_t mask;
   2764     int32_t w;
   2765     int dst_stride, src_stride;
   2766 
   2767     __m128i xmm_mask;
   2768     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
   2769     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   2770     __m128i xmm_alpha_lo, xmm_alpha_hi;
   2771 
   2772     PIXMAN_IMAGE_GET_LINE (
   2773 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   2774     PIXMAN_IMAGE_GET_LINE (
   2775 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   2776 
   2777     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
   2778 
   2779     xmm_mask = create_mask_16_128 (mask >> 24);
   2780 
   2781     while (height--)
   2782     {
   2783 	dst = dst_line;
   2784 	dst_line += dst_stride;
   2785 	src = src_line;
   2786 	src_line += src_stride;
   2787 	w = width;
   2788 
   2789 	while (w && (uintptr_t)dst & 15)
   2790 	{
   2791 	    uint32_t s = *src++;
   2792 
   2793 	    if (s)
   2794 	    {
   2795 		uint32_t d = *dst;
   2796 
   2797 		__m128i ms = unpack_32_1x128 (s);
   2798 		__m128i alpha    = expand_alpha_1x128 (ms);
   2799 		__m128i dest     = xmm_mask;
   2800 		__m128i alpha_dst = unpack_32_1x128 (d);
   2801 
   2802 		*dst = pack_1x128_32 (
   2803 		    in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
   2804 	    }
   2805 	    dst++;
   2806 	    w--;
   2807 	}
   2808 
   2809 	while (w >= 4)
   2810 	{
   2811 	    xmm_src = load_128_unaligned ((__m128i*)src);
   2812 
   2813 	    if (!is_zero (xmm_src))
   2814 	    {
   2815 		xmm_dst = load_128_aligned ((__m128i*)dst);
   2816 
   2817 		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
   2818 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   2819 		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
   2820 				    &xmm_alpha_lo, &xmm_alpha_hi);
   2821 
   2822 		in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
   2823 			       &xmm_alpha_lo, &xmm_alpha_hi,
   2824 			       &xmm_mask, &xmm_mask,
   2825 			       &xmm_dst_lo, &xmm_dst_hi);
   2826 
   2827 		save_128_aligned (
   2828 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   2829 	    }
   2830 
   2831 	    dst += 4;
   2832 	    src += 4;
   2833 	    w -= 4;
   2834 	}
   2835 
   2836 	while (w)
   2837 	{
   2838 	    uint32_t s = *src++;
   2839 
   2840 	    if (s)
   2841 	    {
   2842 		uint32_t d = *dst;
   2843 
   2844 		__m128i ms = unpack_32_1x128 (s);
   2845 		__m128i alpha = expand_alpha_1x128 (ms);
   2846 		__m128i mask  = xmm_mask;
   2847 		__m128i dest  = unpack_32_1x128 (d);
   2848 
   2849 		*dst = pack_1x128_32 (
   2850 		    in_over_1x128 (&ms, &alpha, &mask, &dest));
   2851 	    }
   2852 
   2853 	    dst++;
   2854 	    w--;
   2855 	}
   2856     }
   2857 
   2858 }
   2859 
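/* SRC: convert x8r8g8b8 pixels to r5g6b5. */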
   2860 static void
   2861 sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
   2862                               pixman_composite_info_t *info)
   2863 {
   2864     PIXMAN_COMPOSITE_ARGS (info);
   2865     uint16_t    *dst_line, *dst;
   2866     uint32_t    *src_line, *src, s;
   2867     int dst_stride, src_stride;
   2868     int32_t w;
   2869 
   2870     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   2871     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   2872 
   2873     while (height--)
   2874     {
   2875 	dst = dst_line;
   2876 	dst_line += dst_stride;
   2877 	src = src_line;
   2878 	src_line += src_stride;
   2879 	w = width;
   2880 
   2881 	while (w && (uintptr_t)dst & 15)
   2882 	{
   2883 	    s = *src++;
   2884 	    *dst = convert_8888_to_0565 (s);
   2885 	    dst++;
   2886 	    w--;
   2887 	}
   2888 
   2889 	while (w >= 8)
   2890 	{
   2891 	    __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
   2892 	    __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
   2893 
   2894 	    save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
   2895 
   2896 	    w -= 8;
   2897 	    src += 8;
   2898 	    dst += 8;
   2899 	}
   2900 
   2901 	while (w)
   2902 	{
   2903 	    s = *src++;
   2904 	    *dst = convert_8888_to_0565 (s);
   2905 	    dst++;
   2906 	    w--;
   2907 	}
   2908     }
   2909 }
   2910 
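/* SRC: copy x8r8g8b8 pixels to a8r8g8b8, forcing the alpha byte to 0xff. */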
   2911 static void
   2912 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
   2913 			      pixman_composite_info_t *info)
   2914 {
   2915     PIXMAN_COMPOSITE_ARGS (info);
   2916     uint32_t    *dst_line, *dst;
   2917     uint32_t    *src_line, *src;
   2918     int32_t w;
   2919     int dst_stride, src_stride;
   2920 
   2921 
   2922     PIXMAN_IMAGE_GET_LINE (
   2923 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   2924     PIXMAN_IMAGE_GET_LINE (
   2925 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   2926 
   2927     while (height--)
   2928     {
   2929 	dst = dst_line;
   2930 	dst_line += dst_stride;
   2931 	src = src_line;
   2932 	src_line += src_stride;
   2933 	w = width;
   2934 
   2935 	while (w && (uintptr_t)dst & 15)
   2936 	{
   2937 	    *dst++ = *src++ | 0xff000000;
   2938 	    w--;
   2939 	}
   2940 
   2941 	while (w >= 16)
   2942 	{
   2943 	    __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
   2944 
   2945 	    xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
   2946 	    xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
   2947 	    xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
   2948 	    xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
   2949 
   2950 	    save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
   2951 	    save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
   2952 	    save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
   2953 	    save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
   2954 
   2955 	    dst += 16;
   2956 	    src += 16;
   2957 	    w -= 16;
   2958 	}
   2959 
   2960 	while (w)
   2961 	{
   2962 	    *dst++ = *src++ | 0xff000000;
   2963 	    w--;
   2964 	}
   2965     }
   2966 
   2967 }
   2968 
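/* Composite an x8r8g8b8 source (treated as fully opaque) OVER a 32-bit
 * destination, scaled by the alpha of a solid mask.
 */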
   2969 static void
   2970 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
   2971                                  pixman_composite_info_t *info)
   2972 {
   2973     PIXMAN_COMPOSITE_ARGS (info);
   2974     uint32_t    *dst_line, *dst;
   2975     uint32_t    *src_line, *src;
   2976     uint32_t mask;
   2977     int dst_stride, src_stride;
   2978     int32_t w;
   2979 
   2980     __m128i xmm_mask, xmm_alpha;
   2981     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
   2982     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   2983 
   2984     PIXMAN_IMAGE_GET_LINE (
   2985 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   2986     PIXMAN_IMAGE_GET_LINE (
   2987 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   2988 
   2989     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
   2990 
   2991     xmm_mask = create_mask_16_128 (mask >> 24);
   2992     xmm_alpha = mask_00ff;
   2993 
   2994     while (height--)
   2995     {
   2996 	dst = dst_line;
   2997 	dst_line += dst_stride;
   2998 	src = src_line;
   2999 	src_line += src_stride;
   3000 	w = width;
   3001 
   3002 	while (w && (uintptr_t)dst & 15)
   3003 	{
   3004 	    uint32_t s = (*src++) | 0xff000000;
   3005 	    uint32_t d = *dst;
   3006 
   3007 	    __m128i src   = unpack_32_1x128 (s);
   3008 	    __m128i alpha = xmm_alpha;
   3009 	    __m128i mask  = xmm_mask;
   3010 	    __m128i dest  = unpack_32_1x128 (d);
   3011 
   3012 	    *dst++ = pack_1x128_32 (
   3013 		in_over_1x128 (&src, &alpha, &mask, &dest));
   3014 
   3015 	    w--;
   3016 	}
   3017 
   3018 	while (w >= 4)
   3019 	{
   3020 	    xmm_src = _mm_or_si128 (
   3021 		load_128_unaligned ((__m128i*)src), mask_ff000000);
   3022 	    xmm_dst = load_128_aligned ((__m128i*)dst);
   3023 
   3024 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
   3025 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   3026 
   3027 	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
   3028 			   &xmm_alpha, &xmm_alpha,
   3029 			   &xmm_mask, &xmm_mask,
   3030 			   &xmm_dst_lo, &xmm_dst_hi);
   3031 
   3032 	    save_128_aligned (
   3033 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   3034 
   3035 	    dst += 4;
   3036 	    src += 4;
   3037 	    w -= 4;
   3038 
   3039 	}
   3040 
   3041 	while (w)
   3042 	{
   3043 	    uint32_t s = (*src++) | 0xff000000;
   3044 	    uint32_t d = *dst;
   3045 
   3046 	    __m128i src  = unpack_32_1x128 (s);
   3047 	    __m128i alpha = xmm_alpha;
   3048 	    __m128i mask  = xmm_mask;
   3049 	    __m128i dest  = unpack_32_1x128 (d);
   3050 
   3051 	    *dst++ = pack_1x128_32 (
   3052 		in_over_1x128 (&src, &alpha, &mask, &dest));
   3053 
   3054 	    w--;
   3055 	}
   3056     }
   3057 
   3058 }
   3059 
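/* Composite an a8r8g8b8 source OVER a 32-bit destination, one scanline
 * at a time via the generic OVER combiner.
 */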
   3060 static void
   3061 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
   3062                                pixman_composite_info_t *info)
   3063 {
   3064     PIXMAN_COMPOSITE_ARGS (info);
   3065     int dst_stride, src_stride;
   3066     uint32_t    *dst_line, *dst;
   3067     uint32_t    *src_line, *src;
   3068 
   3069     PIXMAN_IMAGE_GET_LINE (
   3070 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   3071     PIXMAN_IMAGE_GET_LINE (
   3072 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   3073 
   3074     dst = dst_line;
   3075     src = src_line;
   3076 
   3077     while (height--)
   3078     {
   3079 	sse2_combine_over_u (imp, op, dst, src, NULL, width);
   3080 
   3081 	dst += dst_stride;
   3082 	src += src_stride;
   3083     }
   3084 }
   3085 
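/* OVER a single a8r8g8b8 pixel onto an r5g6b5 pixel. */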
   3086 static force_inline uint16_t
   3087 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
   3088 {
   3089     __m128i ms;
   3090 
   3091     ms = unpack_32_1x128 (src);
   3092     return pack_565_32_16 (
   3093 	pack_1x128_32 (
   3094 	    over_1x128 (
   3095 		ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
   3096 }
   3097 
   3098 static void
   3099 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
   3100                                pixman_composite_info_t *info)
   3101 {
   3102     PIXMAN_COMPOSITE_ARGS (info);
   3103     uint16_t    *dst_line, *dst, d;
   3104     uint32_t    *src_line, *src, s;
   3105     int dst_stride, src_stride;
   3106     int32_t w;
   3107 
   3108     __m128i xmm_alpha_lo, xmm_alpha_hi;
   3109     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
   3110     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
   3111 
   3112     PIXMAN_IMAGE_GET_LINE (
   3113 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   3114     PIXMAN_IMAGE_GET_LINE (
   3115 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   3116 
   3117     while (height--)
   3118     {
   3119 	dst = dst_line;
   3120 	src = src_line;
   3121 
   3122 	dst_line += dst_stride;
   3123 	src_line += src_stride;
   3124 	w = width;
   3125 
   3126 	/* Align dst on a 16-byte boundary */
   3127 	while (w &&
   3128 	       ((uintptr_t)dst & 15))
   3129 	{
   3130 	    s = *src++;
   3131 	    d = *dst;
   3132 
   3133 	    *dst++ = composite_over_8888_0565pixel (s, d);
   3134 	    w--;
   3135 	}
   3136 
    3137 	/* It's an 8-pixel loop */
   3138 	while (w >= 8)
   3139 	{
   3140 	    /* I'm loading unaligned because I'm not sure
   3141 	     * about the address alignment.
   3142 	     */
   3143 	    xmm_src = load_128_unaligned ((__m128i*) src);
   3144 	    xmm_dst = load_128_aligned ((__m128i*) dst);
   3145 
   3146 	    /* Unpacking */
   3147 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
   3148 	    unpack_565_128_4x128 (xmm_dst,
   3149 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
   3150 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
   3151 				&xmm_alpha_lo, &xmm_alpha_hi);
   3152 
    3153 	    /* I'm loading the next 4 pixels from memory
    3154 	     * ahead of time to optimize the memory read.
   3155 	     */
   3156 	    xmm_src = load_128_unaligned ((__m128i*) (src + 4));
   3157 
   3158 	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
   3159 			&xmm_alpha_lo, &xmm_alpha_hi,
   3160 			&xmm_dst0, &xmm_dst1);
   3161 
   3162 	    /* Unpacking */
   3163 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
   3164 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
   3165 				&xmm_alpha_lo, &xmm_alpha_hi);
   3166 
   3167 	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
   3168 			&xmm_alpha_lo, &xmm_alpha_hi,
   3169 			&xmm_dst2, &xmm_dst3);
   3170 
   3171 	    save_128_aligned (
   3172 		(__m128i*)dst, pack_565_4x128_128 (
   3173 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
   3174 
   3175 	    w -= 8;
   3176 	    dst += 8;
   3177 	    src += 8;
   3178 	}
   3179 
   3180 	while (w--)
   3181 	{
   3182 	    s = *src++;
   3183 	    d = *dst;
   3184 
   3185 	    *dst++ = composite_over_8888_0565pixel (s, d);
   3186 	}
   3187     }
   3188 
   3189 }
   3190 
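/* Composite a solid color OVER a 32-bit destination through an a8 mask. */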
   3191 static void
   3192 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
   3193                               pixman_composite_info_t *info)
   3194 {
   3195     PIXMAN_COMPOSITE_ARGS (info);
   3196     uint32_t src, srca;
   3197     uint32_t *dst_line, *dst;
   3198     uint8_t *mask_line, *mask;
   3199     int dst_stride, mask_stride;
   3200     int32_t w;
   3201     uint32_t m, d;
   3202 
   3203     __m128i xmm_src, xmm_alpha, xmm_def;
   3204     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   3205     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
   3206 
   3207     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
   3208 
   3209     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   3210 
   3211     srca = src >> 24;
   3212     if (src == 0)
   3213 	return;
   3214 
   3215     PIXMAN_IMAGE_GET_LINE (
   3216 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   3217     PIXMAN_IMAGE_GET_LINE (
   3218 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   3219 
   3220     xmm_def = create_mask_2x32_128 (src, src);
   3221     xmm_src = expand_pixel_32_1x128 (src);
   3222     xmm_alpha = expand_alpha_1x128 (xmm_src);
   3223     mmx_src   = xmm_src;
   3224     mmx_alpha = xmm_alpha;
   3225 
   3226     while (height--)
   3227     {
   3228 	dst = dst_line;
   3229 	dst_line += dst_stride;
   3230 	mask = mask_line;
   3231 	mask_line += mask_stride;
   3232 	w = width;
   3233 
   3234 	while (w && (uintptr_t)dst & 15)
   3235 	{
   3236 	    uint8_t m = *mask++;
   3237 
   3238 	    if (m)
   3239 	    {
   3240 		d = *dst;
   3241 		mmx_mask = expand_pixel_8_1x128 (m);
   3242 		mmx_dest = unpack_32_1x128 (d);
   3243 
   3244 		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
   3245 		                                   &mmx_alpha,
   3246 		                                   &mmx_mask,
   3247 		                                   &mmx_dest));
   3248 	    }
   3249 
   3250 	    w--;
   3251 	    dst++;
   3252 	}
   3253 
   3254 	while (w >= 4)
   3255 	{
   3256 	    m = *((uint32_t*)mask);
   3257 
   3258 	    if (srca == 0xff && m == 0xffffffff)
   3259 	    {
   3260 		save_128_aligned ((__m128i*)dst, xmm_def);
   3261 	    }
   3262 	    else if (m)
   3263 	    {
   3264 		xmm_dst = load_128_aligned ((__m128i*) dst);
   3265 		xmm_mask = unpack_32_1x128 (m);
   3266 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
   3267 
   3268 		/* Unpacking */
   3269 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   3270 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
   3271 
   3272 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
   3273 					&xmm_mask_lo, &xmm_mask_hi);
   3274 
   3275 		in_over_2x128 (&xmm_src, &xmm_src,
   3276 			       &xmm_alpha, &xmm_alpha,
   3277 			       &xmm_mask_lo, &xmm_mask_hi,
   3278 			       &xmm_dst_lo, &xmm_dst_hi);
   3279 
   3280 		save_128_aligned (
   3281 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   3282 	    }
   3283 
   3284 	    w -= 4;
   3285 	    dst += 4;
   3286 	    mask += 4;
   3287 	}
   3288 
   3289 	while (w)
   3290 	{
   3291 	    uint8_t m = *mask++;
   3292 
   3293 	    if (m)
   3294 	    {
   3295 		d = *dst;
   3296 		mmx_mask = expand_pixel_8_1x128 (m);
   3297 		mmx_dest = unpack_32_1x128 (d);
   3298 
   3299 		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
   3300 		                                   &mmx_alpha,
   3301 		                                   &mmx_mask,
   3302 		                                   &mmx_dest));
   3303 	    }
   3304 
   3305 	    w--;
   3306 	    dst++;
   3307 	}
   3308     }
   3309 
   3310 }
   3311 
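/* On 32-bit x86, GCC does not guarantee a 16-byte-aligned stack, so
 * force realignment in case the __m128i local below is spilled.
 */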
   3312 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
   3313 __attribute__((__force_align_arg_pointer__))
   3314 #endif
   3315 static pixman_bool_t
   3316 sse2_fill (pixman_implementation_t *imp,
   3317            uint32_t *               bits,
   3318            int                      stride,
   3319            int                      bpp,
   3320            int                      x,
   3321            int                      y,
   3322            int                      width,
   3323            int                      height,
   3324            uint32_t		    filler)
   3325 {
   3326     uint32_t byte_width;
   3327     uint8_t *byte_line;
   3328 
   3329     __m128i xmm_def;
   3330 
   3331     if (bpp == 8)
   3332     {
   3333 	uint8_t b;
   3334 	uint16_t w;
   3335 
   3336 	stride = stride * (int) sizeof (uint32_t) / 1;
   3337 	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
   3338 	byte_width = width;
   3339 	stride *= 1;
   3340 
   3341 	b = filler & 0xff;
   3342 	w = (b << 8) | b;
   3343 	filler = (w << 16) | w;
   3344     }
   3345     else if (bpp == 16)
   3346     {
   3347 	stride = stride * (int) sizeof (uint32_t) / 2;
   3348 	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
   3349 	byte_width = 2 * width;
   3350 	stride *= 2;
   3351 
   3352         filler = (filler & 0xffff) * 0x00010001;
   3353     }
   3354     else if (bpp == 32)
   3355     {
   3356 	stride = stride * (int) sizeof (uint32_t) / 4;
   3357 	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
   3358 	byte_width = 4 * width;
   3359 	stride *= 4;
   3360     }
   3361     else
   3362     {
   3363 	return FALSE;
   3364     }
   3365 
   3366     xmm_def = create_mask_2x32_128 (filler, filler);
   3367 
   3368     while (height--)
   3369     {
   3370 	int w;
   3371 	uint8_t *d = byte_line;
   3372 	byte_line += stride;
   3373 	w = byte_width;
   3374 
   3375 	if (w >= 1 && ((uintptr_t)d & 1))
   3376 	{
   3377 	    *(uint8_t *)d = filler;
   3378 	    w -= 1;
   3379 	    d += 1;
   3380 	}
   3381 
   3382 	while (w >= 2 && ((uintptr_t)d & 3))
   3383 	{
   3384 	    *(uint16_t *)d = filler;
   3385 	    w -= 2;
   3386 	    d += 2;
   3387 	}
   3388 
   3389 	while (w >= 4 && ((uintptr_t)d & 15))
   3390 	{
   3391 	    *(uint32_t *)d = filler;
   3392 
   3393 	    w -= 4;
   3394 	    d += 4;
   3395 	}
   3396 
   3397 	while (w >= 128)
   3398 	{
   3399 	    save_128_aligned ((__m128i*)(d),     xmm_def);
   3400 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
   3401 	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
   3402 	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
   3403 	    save_128_aligned ((__m128i*)(d + 64),  xmm_def);
   3404 	    save_128_aligned ((__m128i*)(d + 80),  xmm_def);
   3405 	    save_128_aligned ((__m128i*)(d + 96),  xmm_def);
   3406 	    save_128_aligned ((__m128i*)(d + 112), xmm_def);
   3407 
   3408 	    d += 128;
   3409 	    w -= 128;
   3410 	}
   3411 
   3412 	if (w >= 64)
   3413 	{
   3414 	    save_128_aligned ((__m128i*)(d),     xmm_def);
   3415 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
   3416 	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
   3417 	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
   3418 
   3419 	    d += 64;
   3420 	    w -= 64;
   3421 	}
   3422 
   3423 	if (w >= 32)
   3424 	{
   3425 	    save_128_aligned ((__m128i*)(d),     xmm_def);
   3426 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
   3427 
   3428 	    d += 32;
   3429 	    w -= 32;
   3430 	}
   3431 
   3432 	if (w >= 16)
   3433 	{
   3434 	    save_128_aligned ((__m128i*)(d),     xmm_def);
   3435 
   3436 	    d += 16;
   3437 	    w -= 16;
   3438 	}
   3439 
   3440 	while (w >= 4)
   3441 	{
   3442 	    *(uint32_t *)d = filler;
   3443 
   3444 	    w -= 4;
   3445 	    d += 4;
   3446 	}
   3447 
   3448 	if (w >= 2)
   3449 	{
   3450 	    *(uint16_t *)d = filler;
   3451 	    w -= 2;
   3452 	    d += 2;
   3453 	}
   3454 
   3455 	if (w >= 1)
   3456 	{
   3457 	    *(uint8_t *)d = filler;
   3458 	    w -= 1;
   3459 	    d += 1;
   3460 	}
   3461     }
   3462 
   3463     return TRUE;
   3464 }
   3465 
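/* SRC composite of a solid color through an a8 mask into a 32-bit
 * destination: dest = src * mask, with dest zeroed where the mask is zero.
 */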
   3466 static void
   3467 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
   3468                              pixman_composite_info_t *info)
   3469 {
   3470     PIXMAN_COMPOSITE_ARGS (info);
   3471     uint32_t src, srca;
   3472     uint32_t    *dst_line, *dst;
   3473     uint8_t     *mask_line, *mask;
   3474     int dst_stride, mask_stride;
   3475     int32_t w;
   3476     uint32_t m;
   3477 
   3478     __m128i xmm_src, xmm_def;
   3479     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
   3480 
   3481     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   3482 
   3483     srca = src >> 24;
   3484     if (src == 0)
   3485     {
   3486 	sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
   3487 		   PIXMAN_FORMAT_BPP (dest_image->bits.format),
   3488 		   dest_x, dest_y, width, height, 0);
   3489 	return;
   3490     }
   3491 
   3492     PIXMAN_IMAGE_GET_LINE (
   3493 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   3494     PIXMAN_IMAGE_GET_LINE (
   3495 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   3496 
   3497     xmm_def = create_mask_2x32_128 (src, src);
   3498     xmm_src = expand_pixel_32_1x128 (src);
   3499 
   3500     while (height--)
   3501     {
   3502 	dst = dst_line;
   3503 	dst_line += dst_stride;
   3504 	mask = mask_line;
   3505 	mask_line += mask_stride;
   3506 	w = width;
   3507 
   3508 	while (w && (uintptr_t)dst & 15)
   3509 	{
   3510 	    uint8_t m = *mask++;
   3511 
   3512 	    if (m)
   3513 	    {
   3514 		*dst = pack_1x128_32 (
   3515 		    pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
   3516 	    }
   3517 	    else
   3518 	    {
   3519 		*dst = 0;
   3520 	    }
   3521 
   3522 	    w--;
   3523 	    dst++;
   3524 	}
   3525 
   3526 	while (w >= 4)
   3527 	{
   3528 	    m = *((uint32_t*)mask);
   3529 
   3530 	    if (srca == 0xff && m == 0xffffffff)
   3531 	    {
   3532 		save_128_aligned ((__m128i*)dst, xmm_def);
   3533 	    }
   3534 	    else if (m)
   3535 	    {
   3536 		xmm_mask = unpack_32_1x128 (m);
   3537 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
   3538 
   3539 		/* Unpacking */
   3540 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
   3541 
   3542 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
   3543 					&xmm_mask_lo, &xmm_mask_hi);
   3544 
   3545 		pix_multiply_2x128 (&xmm_src, &xmm_src,
   3546 				    &xmm_mask_lo, &xmm_mask_hi,
   3547 				    &xmm_mask_lo, &xmm_mask_hi);
   3548 
   3549 		save_128_aligned (
   3550 		    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
   3551 	    }
   3552 	    else
   3553 	    {
   3554 		save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
   3555 	    }
   3556 
   3557 	    w -= 4;
   3558 	    dst += 4;
   3559 	    mask += 4;
   3560 	}
   3561 
   3562 	while (w)
   3563 	{
   3564 	    uint8_t m = *mask++;
   3565 
   3566 	    if (m)
   3567 	    {
   3568 		*dst = pack_1x128_32 (
   3569 		    pix_multiply_1x128 (
   3570 			xmm_src, expand_pixel_8_1x128 (m)));
   3571 	    }
   3572 	    else
   3573 	    {
   3574 		*dst = 0;
   3575 	    }
   3576 
   3577 	    w--;
   3578 	    dst++;
   3579 	}
   3580     }
   3581 
   3582 }
   3583 
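/* Composite a solid color OVER an r5g6b5 destination through an a8 mask. */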
   3584 static void
   3585 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
   3586                               pixman_composite_info_t *info)
   3587 {
   3588     PIXMAN_COMPOSITE_ARGS (info);
   3589     uint32_t src;
   3590     uint16_t    *dst_line, *dst, d;
   3591     uint8_t     *mask_line, *mask;
   3592     int dst_stride, mask_stride;
   3593     int32_t w;
   3594     uint32_t m;
   3595     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
   3596 
   3597     __m128i xmm_src, xmm_alpha;
   3598     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
   3599     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
   3600 
   3601     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   3602 
   3603     if (src == 0)
   3604 	return;
   3605 
   3606     PIXMAN_IMAGE_GET_LINE (
   3607 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   3608     PIXMAN_IMAGE_GET_LINE (
   3609 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   3610 
   3611     xmm_src = expand_pixel_32_1x128 (src);
   3612     xmm_alpha = expand_alpha_1x128 (xmm_src);
   3613     mmx_src = xmm_src;
   3614     mmx_alpha = xmm_alpha;
   3615 
   3616     while (height--)
   3617     {
   3618 	dst = dst_line;
   3619 	dst_line += dst_stride;
   3620 	mask = mask_line;
   3621 	mask_line += mask_stride;
   3622 	w = width;
   3623 
   3624 	while (w && (uintptr_t)dst & 15)
   3625 	{
   3626 	    m = *mask++;
   3627 
   3628 	    if (m)
   3629 	    {
   3630 		d = *dst;
   3631 		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
   3632 		mmx_dest = expand565_16_1x128 (d);
   3633 
   3634 		*dst = pack_565_32_16 (
   3635 		    pack_1x128_32 (
   3636 			in_over_1x128 (
   3637 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
   3638 	    }
   3639 
   3640 	    w--;
   3641 	    dst++;
   3642 	}
   3643 
   3644 	while (w >= 8)
   3645 	{
   3646 	    xmm_dst = load_128_aligned ((__m128i*) dst);
   3647 	    unpack_565_128_4x128 (xmm_dst,
   3648 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
   3649 
   3650 	    m = *((uint32_t*)mask);
   3651 	    mask += 4;
   3652 
   3653 	    if (m)
   3654 	    {
   3655 		xmm_mask = unpack_32_1x128 (m);
   3656 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
   3657 
   3658 		/* Unpacking */
   3659 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
   3660 
   3661 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
   3662 					&xmm_mask_lo, &xmm_mask_hi);
   3663 
   3664 		in_over_2x128 (&xmm_src, &xmm_src,
   3665 			       &xmm_alpha, &xmm_alpha,
   3666 			       &xmm_mask_lo, &xmm_mask_hi,
   3667 			       &xmm_dst0, &xmm_dst1);
   3668 	    }
   3669 
   3670 	    m = *((uint32_t*)mask);
   3671 	    mask += 4;
   3672 
   3673 	    if (m)
   3674 	    {
   3675 		xmm_mask = unpack_32_1x128 (m);
   3676 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
   3677 
   3678 		/* Unpacking */
   3679 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
   3680 
   3681 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
   3682 					&xmm_mask_lo, &xmm_mask_hi);
   3683 		in_over_2x128 (&xmm_src, &xmm_src,
   3684 			       &xmm_alpha, &xmm_alpha,
   3685 			       &xmm_mask_lo, &xmm_mask_hi,
   3686 			       &xmm_dst2, &xmm_dst3);
   3687 	    }
   3688 
   3689 	    save_128_aligned (
   3690 		(__m128i*)dst, pack_565_4x128_128 (
   3691 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
   3692 
   3693 	    w -= 8;
   3694 	    dst += 8;
   3695 	}
   3696 
   3697 	while (w)
   3698 	{
   3699 	    m = *mask++;
   3700 
   3701 	    if (m)
   3702 	    {
   3703 		d = *dst;
   3704 		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
   3705 		mmx_dest = expand565_16_1x128 (d);
   3706 
   3707 		*dst = pack_565_32_16 (
   3708 		    pack_1x128_32 (
   3709 			in_over_1x128 (
   3710 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
   3711 	    }
   3712 
   3713 	    w--;
   3714 	    dst++;
   3715 	}
   3716     }
   3717 
   3718 }
   3719 
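         /*
          * OVER: non-premultiplied "pixbuf" source (R and B swapped
          * relative to the destination) onto r5g6b5.  Fully opaque source
          * vectors only need invert_colors, fully transparent ones are
          * skipped, and everything else goes through over_rev_non_pre.
          */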
   3720 static void
   3721 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
   3722                                  pixman_composite_info_t *info)
   3723 {
   3724     PIXMAN_COMPOSITE_ARGS (info);
   3725     uint16_t    *dst_line, *dst, d;
   3726     uint32_t    *src_line, *src, s;
   3727     int dst_stride, src_stride;
   3728     int32_t w;
   3729     uint32_t opaque, zero;
   3730 
   3731     __m128i ms;
   3732     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
   3733     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
   3734 
   3735     PIXMAN_IMAGE_GET_LINE (
   3736 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   3737     PIXMAN_IMAGE_GET_LINE (
   3738 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   3739 
   3740     while (height--)
   3741     {
   3742 	dst = dst_line;
   3743 	dst_line += dst_stride;
   3744 	src = src_line;
   3745 	src_line += src_stride;
   3746 	w = width;
   3747 
   3748 	while (w && (uintptr_t)dst & 15)
   3749 	{
   3750 	    s = *src++;
   3751 	    d = *dst;
   3752 
   3753 	    ms = unpack_32_1x128 (s);
   3754 
   3755 	    *dst++ = pack_565_32_16 (
   3756 		pack_1x128_32 (
   3757 		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
   3758 	    w--;
   3759 	}
   3760 
   3761 	while (w >= 8)
   3762 	{
   3763 	    /* First round */
   3764 	    xmm_src = load_128_unaligned ((__m128i*)src);
   3765 	    xmm_dst = load_128_aligned  ((__m128i*)dst);
   3766 
   3767 	    opaque = is_opaque (xmm_src);
   3768 	    zero = is_zero (xmm_src);
   3769 
   3770 	    unpack_565_128_4x128 (xmm_dst,
   3771 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
   3772 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
   3773 
    3774 	    /* preload next round */
   3775 	    xmm_src = load_128_unaligned ((__m128i*)(src + 4));
   3776 
   3777 	    if (opaque)
   3778 	    {
   3779 		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
   3780 				     &xmm_dst0, &xmm_dst1);
   3781 	    }
   3782 	    else if (!zero)
   3783 	    {
   3784 		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
   3785 					&xmm_dst0, &xmm_dst1);
   3786 	    }
   3787 
   3788 	    /* Second round */
   3789 	    opaque = is_opaque (xmm_src);
   3790 	    zero = is_zero (xmm_src);
   3791 
   3792 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
   3793 
   3794 	    if (opaque)
   3795 	    {
   3796 		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
   3797 				     &xmm_dst2, &xmm_dst3);
   3798 	    }
   3799 	    else if (!zero)
   3800 	    {
   3801 		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
   3802 					&xmm_dst2, &xmm_dst3);
   3803 	    }
   3804 
   3805 	    save_128_aligned (
   3806 		(__m128i*)dst, pack_565_4x128_128 (
   3807 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
   3808 
   3809 	    w -= 8;
   3810 	    src += 8;
   3811 	    dst += 8;
   3812 	}
   3813 
   3814 	while (w)
   3815 	{
   3816 	    s = *src++;
   3817 	    d = *dst;
   3818 
   3819 	    ms = unpack_32_1x128 (s);
   3820 
   3821 	    *dst++ = pack_565_32_16 (
   3822 		pack_1x128_32 (
   3823 		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
   3824 	    w--;
   3825 	}
   3826     }
   3827 
   3828 }
   3829 
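         /*
          * Same non-premultiplied pixbuf OVER as above, but with an
          * a8r8g8b8 destination and four pixels per iteration.
          */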
   3830 static void
   3831 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
   3832                                  pixman_composite_info_t *info)
   3833 {
   3834     PIXMAN_COMPOSITE_ARGS (info);
   3835     uint32_t    *dst_line, *dst, d;
   3836     uint32_t    *src_line, *src, s;
   3837     int dst_stride, src_stride;
   3838     int32_t w;
   3839     uint32_t opaque, zero;
   3840 
   3841     __m128i xmm_src_lo, xmm_src_hi;
   3842     __m128i xmm_dst_lo, xmm_dst_hi;
   3843 
   3844     PIXMAN_IMAGE_GET_LINE (
   3845 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   3846     PIXMAN_IMAGE_GET_LINE (
   3847 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   3848 
   3849     while (height--)
   3850     {
   3851 	dst = dst_line;
   3852 	dst_line += dst_stride;
   3853 	src = src_line;
   3854 	src_line += src_stride;
   3855 	w = width;
   3856 
   3857 	while (w && (uintptr_t)dst & 15)
   3858 	{
   3859 	    s = *src++;
   3860 	    d = *dst;
   3861 
   3862 	    *dst++ = pack_1x128_32 (
   3863 		over_rev_non_pre_1x128 (
   3864 		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
   3865 
   3866 	    w--;
   3867 	}
   3868 
   3869 	while (w >= 4)
   3870 	{
   3871 	    xmm_src_hi = load_128_unaligned ((__m128i*)src);
   3872 
   3873 	    opaque = is_opaque (xmm_src_hi);
   3874 	    zero = is_zero (xmm_src_hi);
   3875 
   3876 	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   3877 
   3878 	    if (opaque)
   3879 	    {
   3880 		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
   3881 				     &xmm_dst_lo, &xmm_dst_hi);
   3882 
   3883 		save_128_aligned (
   3884 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   3885 	    }
   3886 	    else if (!zero)
   3887 	    {
   3888 		xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
   3889 
   3890 		unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   3891 
   3892 		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
   3893 					&xmm_dst_lo, &xmm_dst_hi);
   3894 
   3895 		save_128_aligned (
   3896 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   3897 	    }
   3898 
   3899 	    w -= 4;
   3900 	    dst += 4;
   3901 	    src += 4;
   3902 	}
   3903 
   3904 	while (w)
   3905 	{
   3906 	    s = *src++;
   3907 	    d = *dst;
   3908 
   3909 	    *dst++ = pack_1x128_32 (
   3910 		over_rev_non_pre_1x128 (
   3911 		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
   3912 
   3913 	    w--;
   3914 	}
   3915     }
   3916 
   3917 }
   3918 
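         /*
          * OVER with component alpha ("_ca"): solid source, a8r8g8b8
          * mask, r5g6b5 destination.  A movemask against zero lets whole
          * mask vectors be skipped cheaply.
          */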
   3919 static void
   3920 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
   3921                                     pixman_composite_info_t *info)
   3922 {
   3923     PIXMAN_COMPOSITE_ARGS (info);
   3924     uint32_t src;
   3925     uint16_t    *dst_line, *dst, d;
   3926     uint32_t    *mask_line, *mask, m;
   3927     int dst_stride, mask_stride;
    3928     int32_t w;
   3929     uint32_t pack_cmp;
   3930 
   3931     __m128i xmm_src, xmm_alpha;
   3932     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
   3933     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
   3934 
   3935     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
   3936 
   3937     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   3938 
   3939     if (src == 0)
   3940 	return;
   3941 
   3942     PIXMAN_IMAGE_GET_LINE (
   3943 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   3944     PIXMAN_IMAGE_GET_LINE (
   3945 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
   3946 
   3947     xmm_src = expand_pixel_32_1x128 (src);
   3948     xmm_alpha = expand_alpha_1x128 (xmm_src);
   3949     mmx_src = xmm_src;
   3950     mmx_alpha = xmm_alpha;
   3951 
   3952     while (height--)
   3953     {
   3954 	w = width;
   3955 	mask = mask_line;
   3956 	dst = dst_line;
   3957 	mask_line += mask_stride;
   3958 	dst_line += dst_stride;
   3959 
   3960 	while (w && ((uintptr_t)dst & 15))
   3961 	{
   3962 	    m = *(uint32_t *) mask;
   3963 
   3964 	    if (m)
   3965 	    {
   3966 		d = *dst;
   3967 		mmx_mask = unpack_32_1x128 (m);
   3968 		mmx_dest = expand565_16_1x128 (d);
   3969 
   3970 		*dst = pack_565_32_16 (
   3971 		    pack_1x128_32 (
   3972 			in_over_1x128 (
   3973 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
   3974 	    }
   3975 
   3976 	    w--;
   3977 	    dst++;
   3978 	    mask++;
   3979 	}
   3980 
   3981 	while (w >= 8)
   3982 	{
   3983 	    /* First round */
   3984 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
   3985 	    xmm_dst = load_128_aligned ((__m128i*)dst);
   3986 
   3987 	    pack_cmp = _mm_movemask_epi8 (
   3988 		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
   3989 
   3990 	    unpack_565_128_4x128 (xmm_dst,
   3991 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
   3992 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
   3993 
   3994 	    /* preload next round */
   3995 	    xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
   3996 
    3997 	    /* skip when all four mask pixels are zero */
   3998 	    if (pack_cmp != 0xffff)
   3999 	    {
   4000 		in_over_2x128 (&xmm_src, &xmm_src,
   4001 			       &xmm_alpha, &xmm_alpha,
   4002 			       &xmm_mask_lo, &xmm_mask_hi,
   4003 			       &xmm_dst0, &xmm_dst1);
   4004 	    }
   4005 
   4006 	    /* Second round */
   4007 	    pack_cmp = _mm_movemask_epi8 (
   4008 		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
   4009 
   4010 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
   4011 
   4012 	    if (pack_cmp != 0xffff)
   4013 	    {
   4014 		in_over_2x128 (&xmm_src, &xmm_src,
   4015 			       &xmm_alpha, &xmm_alpha,
   4016 			       &xmm_mask_lo, &xmm_mask_hi,
   4017 			       &xmm_dst2, &xmm_dst3);
   4018 	    }
   4019 
   4020 	    save_128_aligned (
   4021 		(__m128i*)dst, pack_565_4x128_128 (
   4022 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
   4023 
   4024 	    w -= 8;
   4025 	    dst += 8;
   4026 	    mask += 8;
   4027 	}
   4028 
   4029 	while (w)
   4030 	{
   4031 	    m = *(uint32_t *) mask;
   4032 
   4033 	    if (m)
   4034 	    {
   4035 		d = *dst;
   4036 		mmx_mask = unpack_32_1x128 (m);
   4037 		mmx_dest = expand565_16_1x128 (d);
   4038 
   4039 		*dst = pack_565_32_16 (
   4040 		    pack_1x128_32 (
   4041 			in_over_1x128 (
   4042 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
   4043 	    }
   4044 
   4045 	    w--;
   4046 	    dst++;
   4047 	    mask++;
   4048 	}
   4049     }
   4050 
   4051 }
   4052 
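         /* IN: dest = src_alpha * mask * dest, with an a8 mask and a8
          * destination; only the solid source's alpha channel matters. */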
   4053 static void
   4054 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
   4055                          pixman_composite_info_t *info)
   4056 {
   4057     PIXMAN_COMPOSITE_ARGS (info);
   4058     uint8_t     *dst_line, *dst;
   4059     uint8_t     *mask_line, *mask;
   4060     int dst_stride, mask_stride;
   4061     uint32_t d, m;
   4062     uint32_t src;
   4063     int32_t w;
   4064 
   4065     __m128i xmm_alpha;
   4066     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
   4067     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   4068 
   4069     PIXMAN_IMAGE_GET_LINE (
   4070 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
   4071     PIXMAN_IMAGE_GET_LINE (
   4072 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   4073 
   4074     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   4075 
   4076     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
   4077 
   4078     while (height--)
   4079     {
   4080 	dst = dst_line;
   4081 	dst_line += dst_stride;
   4082 	mask = mask_line;
   4083 	mask_line += mask_stride;
   4084 	w = width;
   4085 
   4086 	while (w && ((uintptr_t)dst & 15))
   4087 	{
   4088 	    m = (uint32_t) *mask++;
   4089 	    d = (uint32_t) *dst;
   4090 
   4091 	    *dst++ = (uint8_t) pack_1x128_32 (
   4092 		pix_multiply_1x128 (
   4093 		    pix_multiply_1x128 (xmm_alpha,
   4094 				       unpack_32_1x128 (m)),
   4095 		    unpack_32_1x128 (d)));
   4096 	    w--;
   4097 	}
   4098 
   4099 	while (w >= 16)
   4100 	{
   4101 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
   4102 	    xmm_dst = load_128_aligned ((__m128i*)dst);
   4103 
   4104 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
   4105 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   4106 
   4107 	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
   4108 				&xmm_mask_lo, &xmm_mask_hi,
   4109 				&xmm_mask_lo, &xmm_mask_hi);
   4110 
   4111 	    pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
   4112 				&xmm_dst_lo, &xmm_dst_hi,
   4113 				&xmm_dst_lo, &xmm_dst_hi);
   4114 
   4115 	    save_128_aligned (
   4116 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   4117 
   4118 	    mask += 16;
   4119 	    dst += 16;
   4120 	    w -= 16;
   4121 	}
   4122 
   4123 	while (w)
   4124 	{
   4125 	    m = (uint32_t) *mask++;
   4126 	    d = (uint32_t) *dst;
   4127 
   4128 	    *dst++ = (uint8_t) pack_1x128_32 (
   4129 		pix_multiply_1x128 (
   4130 		    pix_multiply_1x128 (
   4131 			xmm_alpha, unpack_32_1x128 (m)),
   4132 		    unpack_32_1x128 (d)));
   4133 	    w--;
   4134 	}
   4135     }
   4136 
   4137 }
   4138 
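         /* IN: dest = src_alpha * dest on a8.  Alpha 0xff leaves the
          * destination untouched; alpha 0 degenerates to a zero fill. */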
   4139 static void
   4140 sse2_composite_in_n_8 (pixman_implementation_t *imp,
   4141 		       pixman_composite_info_t *info)
   4142 {
   4143     PIXMAN_COMPOSITE_ARGS (info);
   4144     uint8_t     *dst_line, *dst;
   4145     int dst_stride;
   4146     uint32_t d;
   4147     uint32_t src;
   4148     int32_t w;
   4149 
   4150     __m128i xmm_alpha;
   4151     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   4152 
   4153     PIXMAN_IMAGE_GET_LINE (
   4154 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
   4155 
   4156     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   4157 
   4158     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
   4159 
   4160     src = src >> 24;
   4161 
   4162     if (src == 0xff)
   4163 	return;
   4164 
   4165     if (src == 0x00)
   4166     {
   4167 	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
   4168 		     8, dest_x, dest_y, width, height, src);
   4169 
   4170 	return;
   4171     }
   4172 
   4173     while (height--)
   4174     {
   4175 	dst = dst_line;
   4176 	dst_line += dst_stride;
   4177 	w = width;
   4178 
   4179 	while (w && ((uintptr_t)dst & 15))
   4180 	{
   4181 	    d = (uint32_t) *dst;
   4182 
   4183 	    *dst++ = (uint8_t) pack_1x128_32 (
   4184 		pix_multiply_1x128 (
   4185 		    xmm_alpha,
   4186 		    unpack_32_1x128 (d)));
   4187 	    w--;
   4188 	}
   4189 
   4190 	while (w >= 16)
   4191 	{
   4192 	    xmm_dst = load_128_aligned ((__m128i*)dst);
   4193 
   4194 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   4195 
   4196 	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
   4197 				&xmm_dst_lo, &xmm_dst_hi,
   4198 				&xmm_dst_lo, &xmm_dst_hi);
   4199 
   4200 	    save_128_aligned (
   4201 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   4202 
   4203 	    dst += 16;
   4204 	    w -= 16;
   4205 	}
   4206 
   4207 	while (w)
   4208 	{
   4209 	    d = (uint32_t) *dst;
   4210 
   4211 	    *dst++ = (uint8_t) pack_1x128_32 (
   4212 		pix_multiply_1x128 (
   4213 		    xmm_alpha,
   4214 		    unpack_32_1x128 (d)));
   4215 	    w--;
   4216 	}
   4217     }
   4218 
   4219 }
   4220 
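         /* IN: dest = src * dest, both a8, sixteen pixels at a time. */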
   4221 static void
   4222 sse2_composite_in_8_8 (pixman_implementation_t *imp,
   4223                        pixman_composite_info_t *info)
   4224 {
   4225     PIXMAN_COMPOSITE_ARGS (info);
   4226     uint8_t     *dst_line, *dst;
   4227     uint8_t     *src_line, *src;
   4228     int src_stride, dst_stride;
   4229     int32_t w;
   4230     uint32_t s, d;
   4231 
   4232     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
   4233     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   4234 
   4235     PIXMAN_IMAGE_GET_LINE (
   4236 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
   4237     PIXMAN_IMAGE_GET_LINE (
   4238 	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
   4239 
   4240     while (height--)
   4241     {
   4242 	dst = dst_line;
   4243 	dst_line += dst_stride;
   4244 	src = src_line;
   4245 	src_line += src_stride;
   4246 	w = width;
   4247 
   4248 	while (w && ((uintptr_t)dst & 15))
   4249 	{
   4250 	    s = (uint32_t) *src++;
   4251 	    d = (uint32_t) *dst;
   4252 
   4253 	    *dst++ = (uint8_t) pack_1x128_32 (
   4254 		pix_multiply_1x128 (
   4255 		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
   4256 	    w--;
   4257 	}
   4258 
   4259 	while (w >= 16)
   4260 	{
   4261 	    xmm_src = load_128_unaligned ((__m128i*)src);
   4262 	    xmm_dst = load_128_aligned ((__m128i*)dst);
   4263 
   4264 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
   4265 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   4266 
   4267 	    pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
   4268 				&xmm_dst_lo, &xmm_dst_hi,
   4269 				&xmm_dst_lo, &xmm_dst_hi);
   4270 
   4271 	    save_128_aligned (
   4272 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   4273 
   4274 	    src += 16;
   4275 	    dst += 16;
   4276 	    w -= 16;
   4277 	}
   4278 
   4279 	while (w)
   4280 	{
   4281 	    s = (uint32_t) *src++;
   4282 	    d = (uint32_t) *dst;
   4283 
   4284 	    *dst++ = (uint8_t) pack_1x128_32 (
   4285 		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
   4286 	    w--;
   4287 	}
   4288     }
   4289 
   4290 }
   4291 
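         /* ADD: dest = clamp (src_alpha * mask + dest) on a8, using
          * saturating unsigned adds. */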
   4292 static void
   4293 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
   4294 			  pixman_composite_info_t *info)
   4295 {
   4296     PIXMAN_COMPOSITE_ARGS (info);
   4297     uint8_t     *dst_line, *dst;
   4298     uint8_t     *mask_line, *mask;
   4299     int dst_stride, mask_stride;
   4300     int32_t w;
   4301     uint32_t src;
   4302     uint32_t m, d;
   4303 
   4304     __m128i xmm_alpha;
   4305     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
   4306     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   4307 
   4308     PIXMAN_IMAGE_GET_LINE (
   4309 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
   4310     PIXMAN_IMAGE_GET_LINE (
   4311 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   4312 
   4313     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   4314 
   4315     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
   4316 
   4317     while (height--)
   4318     {
   4319 	dst = dst_line;
   4320 	dst_line += dst_stride;
   4321 	mask = mask_line;
   4322 	mask_line += mask_stride;
   4323 	w = width;
   4324 
   4325 	while (w && ((uintptr_t)dst & 15))
   4326 	{
   4327 	    m = (uint32_t) *mask++;
   4328 	    d = (uint32_t) *dst;
   4329 
   4330 	    *dst++ = (uint8_t) pack_1x128_32 (
   4331 		_mm_adds_epu16 (
   4332 		    pix_multiply_1x128 (
   4333 			xmm_alpha, unpack_32_1x128 (m)),
   4334 		    unpack_32_1x128 (d)));
   4335 	    w--;
   4336 	}
   4337 
   4338 	while (w >= 16)
   4339 	{
   4340 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
   4341 	    xmm_dst = load_128_aligned ((__m128i*)dst);
   4342 
   4343 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
   4344 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   4345 
   4346 	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
   4347 				&xmm_mask_lo, &xmm_mask_hi,
   4348 				&xmm_mask_lo, &xmm_mask_hi);
   4349 
   4350 	    xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
   4351 	    xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
   4352 
   4353 	    save_128_aligned (
   4354 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   4355 
   4356 	    mask += 16;
   4357 	    dst += 16;
   4358 	    w -= 16;
   4359 	}
   4360 
   4361 	while (w)
   4362 	{
   4363 	    m = (uint32_t) *mask++;
   4364 	    d = (uint32_t) *dst;
   4365 
   4366 	    *dst++ = (uint8_t) pack_1x128_32 (
   4367 		_mm_adds_epu16 (
   4368 		    pix_multiply_1x128 (
   4369 			xmm_alpha, unpack_32_1x128 (m)),
   4370 		    unpack_32_1x128 (d)));
   4371 
   4372 	    w--;
   4373 	}
   4374     }
   4375 
   4376 }
   4377 
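         /* ADD: solid source onto a8.  Alpha 0 is a no-op and alpha 0xff
          * saturates every pixel, so both become early outs; otherwise
          * the replicated alpha byte is added with unsigned saturation. */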
   4378 static void
   4379 sse2_composite_add_n_8 (pixman_implementation_t *imp,
   4380 			pixman_composite_info_t *info)
   4381 {
   4382     PIXMAN_COMPOSITE_ARGS (info);
   4383     uint8_t     *dst_line, *dst;
   4384     int dst_stride;
   4385     int32_t w;
   4386     uint32_t src;
   4387 
   4388     __m128i xmm_src;
   4389 
   4390     PIXMAN_IMAGE_GET_LINE (
   4391 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
   4392 
   4393     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   4394 
   4395     src >>= 24;
   4396 
   4397     if (src == 0x00)
   4398 	return;
   4399 
   4400     if (src == 0xff)
   4401     {
   4402 	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
   4403 		     8, dest_x, dest_y, width, height, 0xff);
   4404 
   4405 	return;
   4406     }
   4407 
   4408     src = (src << 24) | (src << 16) | (src << 8) | src;
   4409     xmm_src = _mm_set_epi32 (src, src, src, src);
   4410 
   4411     while (height--)
   4412     {
   4413 	dst = dst_line;
   4414 	dst_line += dst_stride;
   4415 	w = width;
   4416 
   4417 	while (w && ((uintptr_t)dst & 15))
   4418 	{
   4419 	    *dst = (uint8_t)_mm_cvtsi128_si32 (
   4420 		_mm_adds_epu8 (
   4421 		    xmm_src,
   4422 		    _mm_cvtsi32_si128 (*dst)));
   4423 
   4424 	    w--;
   4425 	    dst++;
   4426 	}
   4427 
   4428 	while (w >= 16)
   4429 	{
   4430 	    save_128_aligned (
   4431 		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
   4432 
   4433 	    dst += 16;
   4434 	    w -= 16;
   4435 	}
   4436 
   4437 	while (w)
   4438 	{
   4439 	    *dst = (uint8_t)_mm_cvtsi128_si32 (
   4440 		_mm_adds_epu8 (
   4441 		    xmm_src,
   4442 		    _mm_cvtsi32_si128 (*dst)));
   4443 
   4444 	    w--;
   4445 	    dst++;
   4446 	}
   4447     }
   4448 
   4449 }
   4450 
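         /* ADD on a8: the scalar head and tail saturate with the
          * t | (0 - (t >> 8)) trick; the aligned middle goes through
          * sse2_combine_add_u four bytes at a time. */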
   4451 static void
   4452 sse2_composite_add_8_8 (pixman_implementation_t *imp,
   4453 			pixman_composite_info_t *info)
   4454 {
   4455     PIXMAN_COMPOSITE_ARGS (info);
   4456     uint8_t     *dst_line, *dst;
   4457     uint8_t     *src_line, *src;
   4458     int dst_stride, src_stride;
   4459     int32_t w;
   4460     uint16_t t;
   4461 
   4462     PIXMAN_IMAGE_GET_LINE (
   4463 	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
   4464     PIXMAN_IMAGE_GET_LINE (
   4465 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
   4466 
   4467     while (height--)
   4468     {
   4469 	dst = dst_line;
   4470 	src = src_line;
   4471 
   4472 	dst_line += dst_stride;
   4473 	src_line += src_stride;
   4474 	w = width;
   4475 
   4476 	/* Small head */
   4477 	while (w && (uintptr_t)dst & 3)
   4478 	{
   4479 	    t = (*dst) + (*src++);
   4480 	    *dst++ = t | (0 - (t >> 8));
   4481 	    w--;
   4482 	}
   4483 
   4484 	sse2_combine_add_u (imp, op,
   4485 			    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
   4486 
    4487 	/* Small tail: the combiner above consumed w rounded down to a multiple of four */
    4488 	dst += w & 0xfffc;
    4489 	src += w & 0xfffc;
   4490 
   4491 	w &= 3;
   4492 
   4493 	while (w)
   4494 	{
   4495 	    t = (*dst) + (*src++);
   4496 	    *dst++ = t | (0 - (t >> 8));
   4497 	    w--;
   4498 	}
   4499     }
   4500 
   4501 }
   4502 
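         /* ADD on a8r8g8b8: each scanline is delegated to
          * sse2_combine_add_u. */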
   4503 static void
   4504 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
   4505                               pixman_composite_info_t *info)
   4506 {
   4507     PIXMAN_COMPOSITE_ARGS (info);
   4508     uint32_t    *dst_line, *dst;
   4509     uint32_t    *src_line, *src;
   4510     int dst_stride, src_stride;
   4511 
   4512     PIXMAN_IMAGE_GET_LINE (
   4513 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   4514     PIXMAN_IMAGE_GET_LINE (
   4515 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   4516 
   4517     while (height--)
   4518     {
   4519 	dst = dst_line;
   4520 	dst_line += dst_stride;
   4521 	src = src_line;
   4522 	src_line += src_stride;
   4523 
   4524 	sse2_combine_add_u (imp, op, dst, src, NULL, width);
   4525     }
   4526 }
   4527 
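         /* ADD: solid source onto a 32bpp destination, with early outs
          * for fully transparent (no-op) and fully saturating (~0)
          * sources. */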
   4528 static void
   4529 sse2_composite_add_n_8888 (pixman_implementation_t *imp,
   4530 			   pixman_composite_info_t *info)
   4531 {
   4532     PIXMAN_COMPOSITE_ARGS (info);
   4533     uint32_t *dst_line, *dst, src;
   4534     int dst_stride;
   4535 
   4536     __m128i xmm_src;
   4537 
   4538     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   4539 
   4540     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   4541     if (src == 0)
   4542 	return;
   4543 
   4544     if (src == ~0)
   4545     {
   4546 	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
   4547 		     dest_x, dest_y, width, height, ~0);
   4548 
   4549 	return;
   4550     }
   4551 
   4552     xmm_src = _mm_set_epi32 (src, src, src, src);
   4553     while (height--)
   4554     {
   4555 	int w = width;
   4556 	uint32_t d;
   4557 
   4558 	dst = dst_line;
   4559 	dst_line += dst_stride;
   4560 
   4561 	while (w && (uintptr_t)dst & 15)
   4562 	{
   4563 	    d = *dst;
   4564 	    *dst++ =
   4565 		_mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
   4566 	    w--;
   4567 	}
   4568 
   4569 	while (w >= 4)
   4570 	{
   4571 	    save_128_aligned
   4572 		((__m128i*)dst,
   4573 		 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
   4574 
   4575 	    dst += 4;
   4576 	    w -= 4;
   4577 	}
   4578 
   4579 	while (w--)
   4580 	{
   4581 	    d = *dst;
   4582 	    *dst++ =
   4583 		_mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
   4584 						  _mm_cvtsi32_si128 (d)));
   4585 	}
   4586     }
   4587 }
   4588 
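         /* ADD: solid source, a8 mask, 32bpp destination. */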
   4589 static void
   4590 sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
   4591 			     pixman_composite_info_t *info)
   4592 {
   4593     PIXMAN_COMPOSITE_ARGS (info);
   4594     uint32_t     *dst_line, *dst;
   4595     uint8_t     *mask_line, *mask;
   4596     int dst_stride, mask_stride;
   4597     int32_t w;
   4598     uint32_t src;
   4599 
   4600     __m128i xmm_src;
   4601 
   4602     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   4603     if (src == 0)
   4604 	return;
   4605     xmm_src = expand_pixel_32_1x128 (src);
   4606 
   4607     PIXMAN_IMAGE_GET_LINE (
   4608 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   4609     PIXMAN_IMAGE_GET_LINE (
   4610 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   4611 
   4612     while (height--)
   4613     {
   4614 	dst = dst_line;
   4615 	dst_line += dst_stride;
   4616 	mask = mask_line;
   4617 	mask_line += mask_stride;
   4618 	w = width;
   4619 
   4620 	while (w && ((uintptr_t)dst & 15))
   4621 	{
   4622 	    uint8_t m = *mask++;
   4623 	    if (m)
   4624 	    {
   4625 		*dst = pack_1x128_32
   4626 		    (_mm_adds_epu16
   4627 		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
   4628 		      unpack_32_1x128 (*dst)));
   4629 	    }
   4630 	    dst++;
   4631 	    w--;
   4632 	}
   4633 
   4634 	while (w >= 4)
   4635 	{
   4636 	    uint32_t m = *(uint32_t*)mask;
   4637 	    if (m)
   4638 	    {
   4639 		__m128i xmm_mask_lo, xmm_mask_hi;
   4640 		__m128i xmm_dst_lo, xmm_dst_hi;
   4641 
   4642 		__m128i xmm_dst = load_128_aligned ((__m128i*)dst);
   4643 		__m128i xmm_mask =
   4644 		    _mm_unpacklo_epi8 (unpack_32_1x128(m),
   4645 				       _mm_setzero_si128 ());
   4646 
   4647 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
   4648 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   4649 
   4650 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
   4651 					&xmm_mask_lo, &xmm_mask_hi);
   4652 
   4653 		pix_multiply_2x128 (&xmm_src, &xmm_src,
   4654 				    &xmm_mask_lo, &xmm_mask_hi,
   4655 				    &xmm_mask_lo, &xmm_mask_hi);
   4656 
   4657 		xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
   4658 		xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
   4659 
   4660 		save_128_aligned (
   4661 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   4662 	    }
   4663 
   4664 	    w -= 4;
   4665 	    dst += 4;
   4666 	    mask += 4;
   4667 	}
   4668 
   4669 	while (w)
   4670 	{
   4671 	    uint8_t m = *mask++;
   4672 	    if (m)
   4673 	    {
   4674 		*dst = pack_1x128_32
   4675 		    (_mm_adds_epu16
   4676 		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
   4677 		      unpack_32_1x128 (*dst)));
   4678 	    }
   4679 	    dst++;
   4680 	    w--;
   4681 	}
   4682     }
   4683 }
   4684 
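         /*
          * Plain rectangle copy for equal 16bpp or 32bpp formats.  The
          * destination pointer is stepped up to 16-byte alignment, the
          * bulk is copied 64 bytes per iteration, and FALSE is returned
          * for unsupported bpp combinations.
          */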
   4685 static pixman_bool_t
   4686 sse2_blt (pixman_implementation_t *imp,
   4687           uint32_t *               src_bits,
   4688           uint32_t *               dst_bits,
   4689           int                      src_stride,
   4690           int                      dst_stride,
   4691           int                      src_bpp,
   4692           int                      dst_bpp,
   4693           int                      src_x,
   4694           int                      src_y,
   4695           int                      dest_x,
   4696           int                      dest_y,
   4697           int                      width,
   4698           int                      height)
   4699 {
   4700     uint8_t *   src_bytes;
   4701     uint8_t *   dst_bytes;
   4702     int byte_width;
   4703 
   4704     if (src_bpp != dst_bpp)
   4705 	return FALSE;
   4706 
   4707     if (src_bpp == 16)
   4708     {
   4709 	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
   4710 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
   4711 	src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
   4712 	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
   4713 	byte_width = 2 * width;
   4714 	src_stride *= 2;
   4715 	dst_stride *= 2;
   4716     }
   4717     else if (src_bpp == 32)
   4718     {
   4719 	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
   4720 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
   4721 	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
   4722 	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
   4723 	byte_width = 4 * width;
   4724 	src_stride *= 4;
   4725 	dst_stride *= 4;
   4726     }
   4727     else
   4728     {
   4729 	return FALSE;
   4730     }
   4731 
   4732     while (height--)
   4733     {
   4734 	int w;
   4735 	uint8_t *s = src_bytes;
   4736 	uint8_t *d = dst_bytes;
   4737 	src_bytes += src_stride;
   4738 	dst_bytes += dst_stride;
   4739 	w = byte_width;
   4740 
   4741 	while (w >= 2 && ((uintptr_t)d & 3))
   4742 	{
   4743 	    *(uint16_t *)d = *(uint16_t *)s;
   4744 	    w -= 2;
   4745 	    s += 2;
   4746 	    d += 2;
   4747 	}
   4748 
   4749 	while (w >= 4 && ((uintptr_t)d & 15))
   4750 	{
   4751 	    *(uint32_t *)d = *(uint32_t *)s;
   4752 
   4753 	    w -= 4;
   4754 	    s += 4;
   4755 	    d += 4;
   4756 	}
   4757 
   4758 	while (w >= 64)
   4759 	{
   4760 	    __m128i xmm0, xmm1, xmm2, xmm3;
   4761 
   4762 	    xmm0 = load_128_unaligned ((__m128i*)(s));
   4763 	    xmm1 = load_128_unaligned ((__m128i*)(s + 16));
   4764 	    xmm2 = load_128_unaligned ((__m128i*)(s + 32));
   4765 	    xmm3 = load_128_unaligned ((__m128i*)(s + 48));
   4766 
   4767 	    save_128_aligned ((__m128i*)(d),    xmm0);
   4768 	    save_128_aligned ((__m128i*)(d + 16), xmm1);
   4769 	    save_128_aligned ((__m128i*)(d + 32), xmm2);
   4770 	    save_128_aligned ((__m128i*)(d + 48), xmm3);
   4771 
   4772 	    s += 64;
   4773 	    d += 64;
   4774 	    w -= 64;
   4775 	}
   4776 
   4777 	while (w >= 16)
   4778 	{
   4779 	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
   4780 
   4781 	    w -= 16;
   4782 	    d += 16;
   4783 	    s += 16;
   4784 	}
   4785 
   4786 	while (w >= 4)
   4787 	{
   4788 	    *(uint32_t *)d = *(uint32_t *)s;
   4789 
   4790 	    w -= 4;
   4791 	    s += 4;
   4792 	    d += 4;
   4793 	}
   4794 
   4795 	if (w >= 2)
   4796 	{
   4797 	    *(uint16_t *)d = *(uint16_t *)s;
   4798 	    w -= 2;
   4799 	    s += 2;
   4800 	    d += 2;
   4801 	}
   4802     }
   4803 
   4804     return TRUE;
   4805 }
   4806 
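         /* SRC copy implemented on top of sse2_blt. */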
   4807 static void
   4808 sse2_composite_copy_area (pixman_implementation_t *imp,
   4809                           pixman_composite_info_t *info)
   4810 {
   4811     PIXMAN_COMPOSITE_ARGS (info);
   4812     sse2_blt (imp, src_image->bits.bits,
   4813 	      dest_image->bits.bits,
   4814 	      src_image->bits.rowstride,
   4815 	      dest_image->bits.rowstride,
   4816 	      PIXMAN_FORMAT_BPP (src_image->bits.format),
   4817 	      PIXMAN_FORMAT_BPP (dest_image->bits.format),
   4818 	      src_x, src_y, dest_x, dest_y, width, height);
   4819 }
   4820 
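         /*
          * OVER: x8r8g8b8 source, a8 mask, a8r8g8b8 destination.  The
          * source is forced opaque with mask_ff000000, so a 0xff mask
          * reduces to a straight copy.
          */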
   4821 static void
   4822 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
   4823                                  pixman_composite_info_t *info)
   4824 {
   4825     PIXMAN_COMPOSITE_ARGS (info);
   4826     uint32_t    *src, *src_line, s;
   4827     uint32_t    *dst, *dst_line, d;
   4828     uint8_t         *mask, *mask_line;
   4829     uint32_t m;
   4830     int src_stride, mask_stride, dst_stride;
   4831     int32_t w;
   4832     __m128i ms;
   4833 
   4834     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
   4835     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   4836     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
   4837 
   4838     PIXMAN_IMAGE_GET_LINE (
   4839 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   4840     PIXMAN_IMAGE_GET_LINE (
   4841 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   4842     PIXMAN_IMAGE_GET_LINE (
   4843 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   4844 
   4845     while (height--)
   4846     {
   4847         src = src_line;
   4848         src_line += src_stride;
   4849         dst = dst_line;
   4850         dst_line += dst_stride;
   4851         mask = mask_line;
   4852         mask_line += mask_stride;
   4853 
   4854         w = width;
   4855 
   4856         while (w && (uintptr_t)dst & 15)
   4857         {
   4858             s = 0xff000000 | *src++;
   4859             m = (uint32_t) *mask++;
   4860             d = *dst;
   4861             ms = unpack_32_1x128 (s);
   4862 
   4863             if (m != 0xff)
   4864             {
   4865 		__m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
   4866 		__m128i md = unpack_32_1x128 (d);
   4867 
   4868                 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
   4869             }
   4870 
   4871             *dst++ = pack_1x128_32 (ms);
   4872             w--;
   4873         }
   4874 
   4875         while (w >= 4)
   4876         {
   4877             m = *(uint32_t*) mask;
   4878             xmm_src = _mm_or_si128 (
   4879 		load_128_unaligned ((__m128i*)src), mask_ff000000);
   4880 
   4881             if (m == 0xffffffff)
   4882             {
   4883                 save_128_aligned ((__m128i*)dst, xmm_src);
   4884             }
   4885             else
   4886             {
   4887                 xmm_dst = load_128_aligned ((__m128i*)dst);
   4888 
   4889                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
   4890 
   4891                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
   4892                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
   4893                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   4894 
   4895                 expand_alpha_rev_2x128 (
   4896 		    xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
   4897 
   4898                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
   4899 			       &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
   4900 			       &xmm_dst_lo, &xmm_dst_hi);
   4901 
   4902                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   4903             }
   4904 
   4905             src += 4;
   4906             dst += 4;
   4907             mask += 4;
   4908             w -= 4;
   4909         }
   4910 
   4911         while (w)
   4912         {
   4913             m = (uint32_t) *mask++;
   4914 
   4915             if (m)
   4916             {
   4917                 s = 0xff000000 | *src;
   4918 
   4919                 if (m == 0xff)
   4920                 {
   4921                     *dst = s;
   4922                 }
   4923                 else
   4924                 {
   4925 		    __m128i ma, md, ms;
   4926 
   4927                     d = *dst;
   4928 
   4929 		    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
   4930 		    md = unpack_32_1x128 (d);
   4931 		    ms = unpack_32_1x128 (s);
   4932 
   4933                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
   4934                 }
   4935 
   4936             }
   4937 
   4938             src++;
   4939             dst++;
   4940             w--;
   4941         }
   4942     }
   4943 
   4944 }
   4945 
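         /*
          * OVER: a8r8g8b8 source, a8 mask, a8r8g8b8 destination.  Zero
          * masks are skipped, and an opaque source vector under a full
          * mask is stored directly.
          */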
   4946 static void
   4947 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
   4948                                  pixman_composite_info_t *info)
   4949 {
   4950     PIXMAN_COMPOSITE_ARGS (info);
   4951     uint32_t    *src, *src_line, s;
   4952     uint32_t    *dst, *dst_line, d;
   4953     uint8_t         *mask, *mask_line;
   4954     uint32_t m;
   4955     int src_stride, mask_stride, dst_stride;
   4956     int32_t w;
   4957 
   4958     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
   4959     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   4960     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
   4961 
   4962     PIXMAN_IMAGE_GET_LINE (
   4963 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   4964     PIXMAN_IMAGE_GET_LINE (
   4965 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   4966     PIXMAN_IMAGE_GET_LINE (
   4967 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   4968 
   4969     while (height--)
   4970     {
   4971         src = src_line;
   4972         src_line += src_stride;
   4973         dst = dst_line;
   4974         dst_line += dst_stride;
   4975         mask = mask_line;
   4976         mask_line += mask_stride;
   4977 
   4978         w = width;
   4979 
   4980         while (w && (uintptr_t)dst & 15)
   4981         {
   4982 	    uint32_t sa;
   4983 
   4984             s = *src++;
   4985             m = (uint32_t) *mask++;
   4986             d = *dst;
   4987 
   4988 	    sa = s >> 24;
   4989 
   4990 	    if (m)
   4991 	    {
   4992 		if (sa == 0xff && m == 0xff)
   4993 		{
   4994 		    *dst = s;
   4995 		}
   4996 		else
   4997 		{
   4998 		    __m128i ms, md, ma, msa;
   4999 
   5000 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
   5001 		    ms = unpack_32_1x128 (s);
   5002 		    md = unpack_32_1x128 (d);
   5003 
   5004 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
   5005 
   5006 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
   5007 		}
   5008 	    }
   5009 
   5010 	    dst++;
   5011             w--;
   5012         }
   5013 
   5014         while (w >= 4)
   5015         {
   5016             m = *(uint32_t *) mask;
   5017 
   5018 	    if (m)
   5019 	    {
   5020 		xmm_src = load_128_unaligned ((__m128i*)src);
   5021 
   5022 		if (m == 0xffffffff && is_opaque (xmm_src))
   5023 		{
   5024 		    save_128_aligned ((__m128i *)dst, xmm_src);
   5025 		}
   5026 		else
   5027 		{
   5028 		    xmm_dst = load_128_aligned ((__m128i *)dst);
   5029 
   5030 		    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
   5031 
   5032 		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
   5033 		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
   5034 		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   5035 
   5036 		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
   5037 		    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
   5038 
   5039 		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
   5040 				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
   5041 
   5042 		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   5043 		}
   5044 	    }
   5045 
   5046             src += 4;
   5047             dst += 4;
   5048             mask += 4;
   5049             w -= 4;
   5050         }
   5051 
   5052         while (w)
   5053         {
   5054 	    uint32_t sa;
   5055 
   5056             s = *src++;
   5057             m = (uint32_t) *mask++;
   5058             d = *dst;
   5059 
   5060 	    sa = s >> 24;
   5061 
   5062 	    if (m)
   5063 	    {
   5064 		if (sa == 0xff && m == 0xff)
   5065 		{
   5066 		    *dst = s;
   5067 		}
   5068 		else
   5069 		{
   5070 		    __m128i ms, md, ma, msa;
   5071 
   5072 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
   5073 		    ms = unpack_32_1x128 (s);
   5074 		    md = unpack_32_1x128 (d);
   5075 
   5076 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
   5077 
   5078 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
   5079 		}
   5080 	    }
   5081 
   5082 	    dst++;
   5083             w--;
   5084         }
   5085     }
   5086 
   5087 }
   5088 
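         /* OVER_REVERSE with a solid source: the destination is
          * composited over the source, dest = dest + (1 - dest.a) * src. */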
   5089 static void
   5090 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
   5091 				    pixman_composite_info_t *info)
   5092 {
   5093     PIXMAN_COMPOSITE_ARGS (info);
   5094     uint32_t src;
   5095     uint32_t    *dst_line, *dst;
   5096     __m128i xmm_src;
   5097     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   5098     __m128i xmm_dsta_hi, xmm_dsta_lo;
   5099     int dst_stride;
   5100     int32_t w;
   5101 
   5102     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   5103 
   5104     if (src == 0)
   5105 	return;
   5106 
   5107     PIXMAN_IMAGE_GET_LINE (
   5108 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   5109 
   5110     xmm_src = expand_pixel_32_1x128 (src);
   5111 
   5112     while (height--)
   5113     {
   5114 	dst = dst_line;
   5115 
   5116 	dst_line += dst_stride;
   5117 	w = width;
   5118 
   5119 	while (w && (uintptr_t)dst & 15)
   5120 	{
   5121 	    __m128i vd;
   5122 
   5123 	    vd = unpack_32_1x128 (*dst);
   5124 
   5125 	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
   5126 					      xmm_src));
   5127 	    w--;
   5128 	    dst++;
   5129 	}
   5130 
   5131 	while (w >= 4)
   5132 	{
   5133 	    __m128i tmp_lo, tmp_hi;
   5134 
   5135 	    xmm_dst = load_128_aligned ((__m128i*)dst);
   5136 
   5137 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   5138 	    expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
   5139 
   5140 	    tmp_lo = xmm_src;
   5141 	    tmp_hi = xmm_src;
   5142 
   5143 	    over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
   5144 			&xmm_dsta_lo, &xmm_dsta_hi,
   5145 			&tmp_lo, &tmp_hi);
   5146 
   5147 	    save_128_aligned (
   5148 		(__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
   5149 
   5150 	    w -= 4;
   5151 	    dst += 4;
   5152 	}
   5153 
   5154 	while (w)
   5155 	{
   5156 	    __m128i vd;
   5157 
   5158 	    vd = unpack_32_1x128 (*dst);
   5159 
   5160 	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
   5161 					      xmm_src));
   5162 	    w--;
   5163 	    dst++;
   5164 	}
   5165 
   5166     }
   5167 
   5168 }
   5169 
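         /* OVER: a8r8g8b8 source and destination; only the alpha channel
          * of the a8r8g8b8 mask is used as coverage. */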
   5170 static void
   5171 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
   5172 				    pixman_composite_info_t *info)
   5173 {
   5174     PIXMAN_COMPOSITE_ARGS (info);
   5175     uint32_t    *src, *src_line, s;
   5176     uint32_t    *dst, *dst_line, d;
   5177     uint32_t    *mask, *mask_line;
   5178     uint32_t    m;
   5179     int src_stride, mask_stride, dst_stride;
   5180     int32_t w;
   5181 
   5182     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
   5183     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   5184     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
   5185 
   5186     PIXMAN_IMAGE_GET_LINE (
   5187 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   5188     PIXMAN_IMAGE_GET_LINE (
   5189 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
   5190     PIXMAN_IMAGE_GET_LINE (
   5191 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   5192 
   5193     while (height--)
   5194     {
   5195         src = src_line;
   5196         src_line += src_stride;
   5197         dst = dst_line;
   5198         dst_line += dst_stride;
   5199         mask = mask_line;
   5200         mask_line += mask_stride;
   5201 
   5202         w = width;
   5203 
   5204         while (w && (uintptr_t)dst & 15)
   5205         {
   5206 	    uint32_t sa;
   5207 
   5208             s = *src++;
   5209             m = (*mask++) >> 24;
   5210             d = *dst;
   5211 
   5212 	    sa = s >> 24;
   5213 
   5214 	    if (m)
   5215 	    {
   5216 		if (sa == 0xff && m == 0xff)
   5217 		{
   5218 		    *dst = s;
   5219 		}
   5220 		else
   5221 		{
   5222 		    __m128i ms, md, ma, msa;
   5223 
   5224 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
   5225 		    ms = unpack_32_1x128 (s);
   5226 		    md = unpack_32_1x128 (d);
   5227 
   5228 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
   5229 
   5230 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
   5231 		}
   5232 	    }
   5233 
   5234 	    dst++;
   5235             w--;
   5236         }
   5237 
   5238         while (w >= 4)
   5239         {
   5240 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
   5241 
   5242 	    if (!is_transparent (xmm_mask))
   5243 	    {
   5244 		xmm_src = load_128_unaligned ((__m128i*)src);
   5245 
   5246 		if (is_opaque (xmm_mask) && is_opaque (xmm_src))
   5247 		{
   5248 		    save_128_aligned ((__m128i *)dst, xmm_src);
   5249 		}
   5250 		else
   5251 		{
   5252 		    xmm_dst = load_128_aligned ((__m128i *)dst);
   5253 
   5254 		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
   5255 		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
   5256 		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   5257 
   5258 		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
   5259 		    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
   5260 
   5261 		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
   5262 				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
   5263 
   5264 		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   5265 		}
   5266 	    }
   5267 
   5268             src += 4;
   5269             dst += 4;
   5270             mask += 4;
   5271             w -= 4;
   5272         }
   5273 
   5274         while (w)
   5275         {
   5276 	    uint32_t sa;
   5277 
   5278             s = *src++;
   5279             m = (*mask++) >> 24;
   5280             d = *dst;
   5281 
   5282 	    sa = s >> 24;
   5283 
   5284 	    if (m)
   5285 	    {
   5286 		if (sa == 0xff && m == 0xff)
   5287 		{
   5288 		    *dst = s;
   5289 		}
   5290 		else
   5291 		{
   5292 		    __m128i ms, md, ma, msa;
   5293 
   5294 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
   5295 		    ms = unpack_32_1x128 (s);
   5296 		    md = unpack_32_1x128 (d);
   5297 
   5298 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
   5299 
   5300 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
   5301 		}
   5302 	    }
   5303 
   5304 	    dst++;
   5305             w--;
   5306         }
   5307     }
   5308 
   5309 }
   5310 
   5311 /* A variant of 'sse2_combine_over_u' with minor tweaks */
   5312 static force_inline void
   5313 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
   5314                                              const uint32_t* ps,
   5315                                              int32_t         w,
   5316                                              pixman_fixed_t  vx,
   5317                                              pixman_fixed_t  unit_x,
   5318                                              pixman_fixed_t  src_width_fixed,
   5319                                              pixman_bool_t   fully_transparent_src)
   5320 {
   5321     uint32_t s, d;
   5322     const uint32_t* pm = NULL;
   5323 
   5324     __m128i xmm_dst_lo, xmm_dst_hi;
   5325     __m128i xmm_src_lo, xmm_src_hi;
   5326     __m128i xmm_alpha_lo, xmm_alpha_hi;
   5327 
   5328     if (fully_transparent_src)
   5329 	return;
   5330 
   5331     /* Align dst on a 16-byte boundary */
   5332     while (w && ((uintptr_t)pd & 15))
   5333     {
   5334 	d = *pd;
   5335 	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
   5336 	vx += unit_x;
   5337 	while (vx >= 0)
   5338 	    vx -= src_width_fixed;
   5339 
   5340 	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
   5341 	if (pm)
   5342 	    pm++;
   5343 	w--;
   5344     }
   5345 
   5346     while (w >= 4)
   5347     {
   5348 	__m128i tmp;
   5349 	uint32_t tmp1, tmp2, tmp3, tmp4;
   5350 
   5351 	tmp1 = *(ps + pixman_fixed_to_int (vx));
   5352 	vx += unit_x;
   5353 	while (vx >= 0)
   5354 	    vx -= src_width_fixed;
   5355 	tmp2 = *(ps + pixman_fixed_to_int (vx));
   5356 	vx += unit_x;
   5357 	while (vx >= 0)
   5358 	    vx -= src_width_fixed;
   5359 	tmp3 = *(ps + pixman_fixed_to_int (vx));
   5360 	vx += unit_x;
   5361 	while (vx >= 0)
   5362 	    vx -= src_width_fixed;
   5363 	tmp4 = *(ps + pixman_fixed_to_int (vx));
   5364 	vx += unit_x;
   5365 	while (vx >= 0)
   5366 	    vx -= src_width_fixed;
   5367 
   5368 	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
   5369 
   5370 	xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
   5371 
   5372 	if (is_opaque (xmm_src_hi))
   5373 	{
   5374 	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
   5375 	}
   5376 	else if (!is_zero (xmm_src_hi))
   5377 	{
   5378 	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);
   5379 
   5380 	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   5381 	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   5382 
   5383 	    expand_alpha_2x128 (
   5384 		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
   5385 
   5386 	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
   5387 			&xmm_alpha_lo, &xmm_alpha_hi,
   5388 			&xmm_dst_lo, &xmm_dst_hi);
   5389 
    5390 	    /* rebuild the 4 pixel data and save */
   5391 	    save_128_aligned ((__m128i*)pd,
   5392 			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   5393 	}
   5394 
   5395 	w -= 4;
   5396 	pd += 4;
   5397 	if (pm)
   5398 	    pm += 4;
   5399     }
   5400 
   5401     while (w)
   5402     {
   5403 	d = *pd;
   5404 	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
   5405 	vx += unit_x;
   5406 	while (vx >= 0)
   5407 	    vx -= src_width_fixed;
   5408 
   5409 	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
   5410 	if (pm)
   5411 	    pm++;
   5412 
   5413 	w--;
   5414     }
   5415 }
   5416 
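         /* Instantiate the nearest-neighbour main loops for the four
          * repeat modes around the scanline function above. */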
   5417 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
   5418 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
   5419 		       uint32_t, uint32_t, COVER)
   5420 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
   5421 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
   5422 		       uint32_t, uint32_t, NONE)
   5423 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
   5424 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
   5425 		       uint32_t, uint32_t, PAD)
   5426 FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
   5427 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
   5428 		       uint32_t, uint32_t, NORMAL)
   5429 
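         /* Nearest-neighbour scaled OVER, modulated by a constant alpha
          * taken from the top byte of *mask. */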
   5430 static force_inline void
   5431 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
   5432 					       uint32_t *       dst,
   5433 					       const uint32_t * src,
   5434 					       int32_t          w,
   5435 					       pixman_fixed_t   vx,
   5436 					       pixman_fixed_t   unit_x,
   5437 					       pixman_fixed_t   src_width_fixed,
   5438 					       pixman_bool_t    zero_src)
   5439 {
   5440     __m128i xmm_mask;
   5441     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
   5442     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   5443     __m128i xmm_alpha_lo, xmm_alpha_hi;
   5444 
   5445     if (zero_src || (*mask >> 24) == 0)
   5446 	return;
   5447 
   5448     xmm_mask = create_mask_16_128 (*mask >> 24);
   5449 
   5450     while (w && (uintptr_t)dst & 15)
   5451     {
   5452 	uint32_t s = *(src + pixman_fixed_to_int (vx));
   5453 	vx += unit_x;
   5454 	while (vx >= 0)
   5455 	    vx -= src_width_fixed;
   5456 
   5457 	if (s)
   5458 	{
   5459 	    uint32_t d = *dst;
   5460 
   5461 	    __m128i ms = unpack_32_1x128 (s);
   5462 	    __m128i alpha     = expand_alpha_1x128 (ms);
   5463 	    __m128i dest      = xmm_mask;
   5464 	    __m128i alpha_dst = unpack_32_1x128 (d);
   5465 
   5466 	    *dst = pack_1x128_32 (
   5467 		in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
   5468 	}
   5469 	dst++;
   5470 	w--;
   5471     }
   5472 
   5473     while (w >= 4)
   5474     {
   5475 	uint32_t tmp1, tmp2, tmp3, tmp4;
   5476 
   5477 	tmp1 = *(src + pixman_fixed_to_int (vx));
   5478 	vx += unit_x;
   5479 	while (vx >= 0)
   5480 	    vx -= src_width_fixed;
   5481 	tmp2 = *(src + pixman_fixed_to_int (vx));
   5482 	vx += unit_x;
   5483 	while (vx >= 0)
   5484 	    vx -= src_width_fixed;
   5485 	tmp3 = *(src + pixman_fixed_to_int (vx));
   5486 	vx += unit_x;
   5487 	while (vx >= 0)
   5488 	    vx -= src_width_fixed;
   5489 	tmp4 = *(src + pixman_fixed_to_int (vx));
   5490 	vx += unit_x;
   5491 	while (vx >= 0)
   5492 	    vx -= src_width_fixed;
   5493 
   5494 	xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
   5495 
   5496 	if (!is_zero (xmm_src))
   5497 	{
   5498 	    xmm_dst = load_128_aligned ((__m128i*)dst);
   5499 
   5500 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
   5501 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   5502 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
   5503 			        &xmm_alpha_lo, &xmm_alpha_hi);
   5504 
   5505 	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
   5506 			   &xmm_alpha_lo, &xmm_alpha_hi,
   5507 			   &xmm_mask, &xmm_mask,
   5508 			   &xmm_dst_lo, &xmm_dst_hi);
   5509 
   5510 	    save_128_aligned (
   5511 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   5512 	}
   5513 
   5514 	dst += 4;
   5515 	w -= 4;
   5516     }
   5517 
   5518     while (w)
   5519     {
   5520 	uint32_t s = *(src + pixman_fixed_to_int (vx));
   5521 	vx += unit_x;
   5522 	while (vx >= 0)
   5523 	    vx -= src_width_fixed;
   5524 
   5525 	if (s)
   5526 	{
   5527 	    uint32_t d = *dst;
   5528 
   5529 	    __m128i ms = unpack_32_1x128 (s);
   5530 	    __m128i alpha = expand_alpha_1x128 (ms);
   5531 	    __m128i mask  = xmm_mask;
   5532 	    __m128i dest  = unpack_32_1x128 (d);
   5533 
   5534 	    *dst = pack_1x128_32 (
   5535 		in_over_1x128 (&ms, &alpha, &mask, &dest));
   5536 	}
   5537 
   5538 	dst++;
   5539 	w--;
   5540     }
   5541 
   5542 }
   5543 
   5544 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
   5545 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
   5546 			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
   5547 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
   5548 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
   5549 			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
   5550 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
   5551 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
   5552 			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
   5553 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
   5554 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
   5555 			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
   5556 
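/* Two variants of the bilinear variable block.  With fewer than 8
 * interpolation bits the left/right horizontal weights are interleaved
 * per pixel so that _mm_madd_epi16 can do the horizontal pass in a
 * single multiply-add.  With 8 bits, the vertically interpolated
 * channels can reach 255 * 256, which no longer fits a signed 16-bit
 * lane, so that path instead uses an unsigned mullo/mulhi pair and
 * 32-bit additions (see BILINEAR_INTERPOLATE_ONE_PIXEL below).
 */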
   5557 #if BILINEAR_INTERPOLATION_BITS < 8
   5558 # define BILINEAR_DECLARE_VARIABLES						\
   5559     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
   5560     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
   5561     const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
   5562     const __m128i xmm_ux = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
   5563 					  unit_x, -unit_x, unit_x, -unit_x);	\
   5564     const __m128i xmm_zero = _mm_setzero_si128 ();				\
   5565     __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1),		\
   5566 				   vx, -(vx + 1), vx, -(vx + 1))
   5567 #else
   5568 # define BILINEAR_DECLARE_VARIABLES						\
   5569     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
   5570     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
   5571     const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
   5572     const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,	\
   5573 					  -unit_x, -unit_x, -unit_x, -unit_x);	\
   5574     const __m128i xmm_zero = _mm_setzero_si128 ();				\
   5575     __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx,				\
   5576 				   -(vx + 1), -(vx + 1), -(vx + 1), -(vx + 1))
   5577 #endif
   5578 
   5579 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
   5580 do {										\
   5581     __m128i xmm_wh, xmm_lo, xmm_hi, a;						\
   5582     /* fetch 2x2 pixel block into sse2 registers */				\
   5583     __m128i tltr = _mm_loadl_epi64 (						\
   5584 			    (__m128i *)&src_top[pixman_fixed_to_int (vx)]);	\
   5585     __m128i blbr = _mm_loadl_epi64 (						\
   5586 			    (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]);	\
   5587     vx += unit_x;								\
   5588     /* vertical interpolation */						\
   5589     a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero),	\
   5590 					xmm_wt),				\
   5591 		       _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero),	\
   5592 					xmm_wb));				\
   5593     if (BILINEAR_INTERPOLATION_BITS < 8)					\
   5594     {										\
   5595 	/* calculate horizontal weights */					\
   5596 	xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
   5597 					16 - BILINEAR_INTERPOLATION_BITS));	\
   5598 	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
   5599 	/* horizontal interpolation */						\
   5600 	a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
   5601 		a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh);			\
   5602     }										\
   5603     else									\
   5604     {										\
   5605 	/* calculate horizontal weights */					\
   5606 	xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
   5607 					16 - BILINEAR_INTERPOLATION_BITS));	\
   5608 	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
   5609 	/* horizontal interpolation */						\
   5610 	xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
   5611 	xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\
   5612 	a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\
   5613 			   _mm_unpackhi_epi16 (xmm_lo, xmm_hi));		\
   5614     }										\
   5615     /* shift and pack the result */						\
   5616     a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2);			\
   5617     a = _mm_packs_epi32 (a, a);							\
   5618     a = _mm_packus_epi16 (a, a);						\
   5619     pix = _mm_cvtsi128_si32 (a);						\
   5620 } while (0)
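/* Per channel, and with W = 1 << BILINEAR_INTERPOLATION_BITS, the macro
 * above effectively computes
 *
 *   pix = ((tl * wt + bl * wb) * (W - wx) +
 *          (tr * wt + br * wb) * wx) >> (2 * BILINEAR_INTERPOLATION_BITS)
 *
 * i.e. a vertical lerp between the top and bottom rows followed by a
 * horizontal lerp, where wt + wb == wx + (W - wx) == W.
 */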
   5621 
   5622 #define BILINEAR_SKIP_ONE_PIXEL()						\
   5623 do {										\
   5624     vx += unit_x;								\
   5625     xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
    5626 } while (0)
   5627 
   5628 static force_inline void
   5629 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
   5630 					     const uint32_t * mask,
   5631 					     const uint32_t * src_top,
   5632 					     const uint32_t * src_bottom,
   5633 					     int32_t          w,
   5634 					     int              wt,
   5635 					     int              wb,
   5636 					     pixman_fixed_t   vx,
   5637 					     pixman_fixed_t   unit_x,
   5638 					     pixman_fixed_t   max_vx,
   5639 					     pixman_bool_t    zero_src)
   5640 {
   5641     BILINEAR_DECLARE_VARIABLES;
   5642     uint32_t pix1, pix2, pix3, pix4;
   5643 
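    /* SRC never reads dst, so no alignment prologue is needed: the loop
     * simply interpolates and stores four pixels per iteration, then
     * handles the 0-3 remaining pixels below.
     */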
   5644     while ((w -= 4) >= 0)
   5645     {
   5646 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   5647 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
   5648 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
   5649 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
   5650 	*dst++ = pix1;
   5651 	*dst++ = pix2;
   5652 	*dst++ = pix3;
   5653 	*dst++ = pix4;
   5654     }
   5655 
   5656     if (w & 2)
   5657     {
   5658 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   5659 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
   5660 	*dst++ = pix1;
   5661 	*dst++ = pix2;
   5662     }
   5663 
   5664     if (w & 1)
   5665     {
   5666 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   5667 	*dst = pix1;
   5668     }
   5669 
   5670 }
   5671 
   5672 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
   5673 			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
   5674 			       uint32_t, uint32_t, uint32_t,
   5675 			       COVER, FLAG_NONE)
   5676 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
   5677 			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
   5678 			       uint32_t, uint32_t, uint32_t,
   5679 			       PAD, FLAG_NONE)
   5680 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
   5681 			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
   5682 			       uint32_t, uint32_t, uint32_t,
   5683 			       NONE, FLAG_NONE)
   5684 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
   5685 			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
   5686 			       uint32_t, uint32_t, uint32_t,
   5687 			       NORMAL, FLAG_NONE)
   5688 
   5689 static force_inline void
   5690 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
   5691 					      const uint32_t * mask,
   5692 					      const uint32_t * src_top,
   5693 					      const uint32_t * src_bottom,
   5694 					      int32_t          w,
   5695 					      int              wt,
   5696 					      int              wb,
   5697 					      pixman_fixed_t   vx,
   5698 					      pixman_fixed_t   unit_x,
   5699 					      pixman_fixed_t   max_vx,
   5700 					      pixman_bool_t    zero_src)
   5701 {
   5702     BILINEAR_DECLARE_VARIABLES;
   5703     uint32_t pix1, pix2, pix3, pix4;
   5704 
   5705     while (w && ((uintptr_t)dst & 15))
   5706     {
   5707 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   5708 
   5709 	if (pix1)
   5710 	{
   5711 	    pix2 = *dst;
   5712 	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
   5713 	}
   5714 
   5715 	w--;
   5716 	dst++;
   5717     }
   5718 
    5719     while (w >= 4)
   5720     {
   5721 	__m128i xmm_src;
   5722 	__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
   5723 	__m128i xmm_alpha_hi, xmm_alpha_lo;
   5724 
   5725 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   5726 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
   5727 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
   5728 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
   5729 
   5730 	xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
   5731 
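	/* Fast paths for the two trivial OVER cases: a fully transparent
	 * source leaves dst untouched, and a fully opaque source can be
	 * stored directly without blending.
	 */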
   5732 	if (!is_zero (xmm_src))
   5733 	{
   5734 	    if (is_opaque (xmm_src))
   5735 	    {
   5736 		save_128_aligned ((__m128i *)dst, xmm_src);
   5737 	    }
   5738 	    else
   5739 	    {
   5740 		__m128i xmm_dst = load_128_aligned ((__m128i *)dst);
   5741 
   5742 		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
   5743 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   5744 
   5745 		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
   5746 		over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
   5747 			    &xmm_dst_lo, &xmm_dst_hi);
   5748 
   5749 		save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   5750 	    }
   5751 	}
   5752 
   5753 	w -= 4;
   5754 	dst += 4;
   5755     }
   5756 
   5757     while (w)
   5758     {
   5759 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   5760 
   5761 	if (pix1)
   5762 	{
   5763 	    pix2 = *dst;
   5764 	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
   5765 	}
   5766 
   5767 	w--;
   5768 	dst++;
   5769     }
   5770 }
   5771 
   5772 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
   5773 			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
   5774 			       uint32_t, uint32_t, uint32_t,
   5775 			       COVER, FLAG_NONE)
   5776 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
   5777 			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
   5778 			       uint32_t, uint32_t, uint32_t,
   5779 			       PAD, FLAG_NONE)
   5780 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
   5781 			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
   5782 			       uint32_t, uint32_t, uint32_t,
   5783 			       NONE, FLAG_NONE)
   5784 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
   5785 			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
   5786 			       uint32_t, uint32_t, uint32_t,
   5787 			       NORMAL, FLAG_NONE)
   5788 
   5789 static force_inline void
   5790 scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
   5791 						const uint8_t  * mask,
   5792 						const uint32_t * src_top,
   5793 						const uint32_t * src_bottom,
   5794 						int32_t          w,
   5795 						int              wt,
   5796 						int              wb,
   5797 						pixman_fixed_t   vx,
   5798 						pixman_fixed_t   unit_x,
   5799 						pixman_fixed_t   max_vx,
   5800 						pixman_bool_t    zero_src)
   5801 {
   5802     BILINEAR_DECLARE_VARIABLES;
   5803     uint32_t pix1, pix2, pix3, pix4;
   5804     uint32_t m;
   5805 
   5806     while (w && ((uintptr_t)dst & 15))
   5807     {
   5808 	uint32_t sa;
   5809 
   5810 	m = (uint32_t) *mask++;
   5811 
   5812 	if (m)
   5813 	{
   5814 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   5815 	    sa = pix1 >> 24;
   5816 
   5817 	    if (sa == 0xff && m == 0xff)
   5818 	    {
   5819 		*dst = pix1;
   5820 	    }
   5821 	    else
   5822 	    {
   5823 		__m128i ms, md, ma, msa;
   5824 
   5825 		pix2 = *dst;
   5826 		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
   5827 		ms = unpack_32_1x128 (pix1);
   5828 		md = unpack_32_1x128 (pix2);
   5829 
   5830 		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
   5831 
   5832 		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
   5833 	    }
   5834 	}
   5835 	else
   5836 	{
   5837 	    BILINEAR_SKIP_ONE_PIXEL ();
   5838 	}
   5839 
   5840 	w--;
   5841 	dst++;
   5842     }
   5843 
   5844     while (w >= 4)
   5845     {
   5846 	__m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
   5847 	__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   5848 	__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
   5849 
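	/* Load four a8 mask values with a single (possibly unaligned)
	 * 32-bit read; m == 0 lets us skip the whole 4-pixel block.
	 */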
   5850 	m = *(uint32_t*)mask;
   5851 
   5852 	if (m)
   5853 	{
   5854 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   5855 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
   5856 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
   5857 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
   5858 
   5859 	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
   5860 
   5861 	    if (m == 0xffffffff && is_opaque (xmm_src))
   5862 	    {
   5863 		save_128_aligned ((__m128i *)dst, xmm_src);
   5864 	    }
   5865 	    else
   5866 	    {
   5867 		xmm_dst = load_128_aligned ((__m128i *)dst);
   5868 
   5869 		xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
   5870 
   5871 		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
   5872 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
   5873 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   5874 
   5875 		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
   5876 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
   5877 
   5878 		in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
   5879 			       &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
   5880 
   5881 		save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   5882 	    }
   5883 	}
   5884 	else
   5885 	{
   5886 	    BILINEAR_SKIP_ONE_PIXEL ();
   5887 	    BILINEAR_SKIP_ONE_PIXEL ();
   5888 	    BILINEAR_SKIP_ONE_PIXEL ();
   5889 	    BILINEAR_SKIP_ONE_PIXEL ();
   5890 	}
   5891 
   5892 	w -= 4;
   5893 	dst += 4;
   5894 	mask += 4;
   5895     }
   5896 
   5897     while (w)
   5898     {
   5899 	uint32_t sa;
   5900 
   5901 	m = (uint32_t) *mask++;
   5902 
   5903 	if (m)
   5904 	{
   5905 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   5906 	    sa = pix1 >> 24;
   5907 
   5908 	    if (sa == 0xff && m == 0xff)
   5909 	    {
   5910 		*dst = pix1;
   5911 	    }
   5912 	    else
   5913 	    {
   5914 		__m128i ms, md, ma, msa;
   5915 
   5916 		pix2 = *dst;
   5917 		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
   5918 		ms = unpack_32_1x128 (pix1);
   5919 		md = unpack_32_1x128 (pix2);
   5920 
   5921 		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
   5922 
   5923 		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
   5924 	    }
   5925 	}
   5926 	else
   5927 	{
   5928 	    BILINEAR_SKIP_ONE_PIXEL ();
   5929 	}
   5930 
   5931 	w--;
   5932 	dst++;
   5933     }
   5934 }
   5935 
   5936 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
   5937 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
   5938 			       uint32_t, uint8_t, uint32_t,
   5939 			       COVER, FLAG_HAVE_NON_SOLID_MASK)
   5940 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
   5941 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
   5942 			       uint32_t, uint8_t, uint32_t,
   5943 			       PAD, FLAG_HAVE_NON_SOLID_MASK)
   5944 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
   5945 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
   5946 			       uint32_t, uint8_t, uint32_t,
   5947 			       NONE, FLAG_HAVE_NON_SOLID_MASK)
   5948 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
   5949 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
   5950 			       uint32_t, uint8_t, uint32_t,
   5951 			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
   5952 
   5953 static force_inline void
   5954 scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
   5955 						const uint32_t * mask,
   5956 						const uint32_t * src_top,
   5957 						const uint32_t * src_bottom,
   5958 						int32_t          w,
   5959 						int              wt,
   5960 						int              wb,
   5961 						pixman_fixed_t   vx,
   5962 						pixman_fixed_t   unit_x,
   5963 						pixman_fixed_t   max_vx,
   5964 						pixman_bool_t    zero_src)
   5965 {
   5966     BILINEAR_DECLARE_VARIABLES;
   5967     uint32_t pix1, pix2, pix3, pix4;
   5968     __m128i xmm_mask;
   5969 
   5970     if (zero_src || (*mask >> 24) == 0)
   5971 	return;
   5972 
   5973     xmm_mask = create_mask_16_128 (*mask >> 24);
   5974 
   5975     while (w && ((uintptr_t)dst & 15))
   5976     {
   5977 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   5978 	if (pix1)
   5979 	{
   5980 		uint32_t d = *dst;
   5981 
   5982 		__m128i ms = unpack_32_1x128 (pix1);
   5983 		__m128i alpha     = expand_alpha_1x128 (ms);
   5984 		__m128i dest      = xmm_mask;
   5985 		__m128i alpha_dst = unpack_32_1x128 (d);
   5986 
   5987 		*dst = pack_1x128_32
   5988 			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
   5989 	}
   5990 
   5991 	dst++;
   5992 	w--;
   5993     }
   5994 
   5995     while (w >= 4)
   5996     {
   5997 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   5998 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
   5999 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
   6000 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
   6001 
   6002 	if (pix1 | pix2 | pix3 | pix4)
   6003 	{
   6004 	    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
   6005 	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   6006 	    __m128i xmm_alpha_lo, xmm_alpha_hi;
   6007 
   6008 	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
   6009 
   6010 	    xmm_dst = load_128_aligned ((__m128i*)dst);
   6011 
   6012 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
   6013 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   6014 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
   6015 				&xmm_alpha_lo, &xmm_alpha_hi);
   6016 
   6017 	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
   6018 			   &xmm_alpha_lo, &xmm_alpha_hi,
   6019 			   &xmm_mask, &xmm_mask,
   6020 			   &xmm_dst_lo, &xmm_dst_hi);
   6021 
   6022 	    save_128_aligned
   6023 		((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   6024 	}
   6025 
   6026 	dst += 4;
   6027 	w -= 4;
   6028     }
   6029 
   6030     while (w)
   6031     {
   6032 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   6033 	if (pix1)
   6034 	{
   6035 		uint32_t d = *dst;
   6036 
   6037 		__m128i ms = unpack_32_1x128 (pix1);
   6038 		__m128i alpha     = expand_alpha_1x128 (ms);
   6039 		__m128i dest      = xmm_mask;
   6040 		__m128i alpha_dst = unpack_32_1x128 (d);
   6041 
   6042 		*dst = pack_1x128_32
   6043 			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
   6044 	}
   6045 
   6046 	dst++;
   6047 	w--;
   6048     }
   6049 }
   6050 
   6051 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
   6052 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
   6053 			       uint32_t, uint32_t, uint32_t,
   6054 			       COVER, FLAG_HAVE_SOLID_MASK)
   6055 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
   6056 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
   6057 			       uint32_t, uint32_t, uint32_t,
   6058 			       PAD, FLAG_HAVE_SOLID_MASK)
   6059 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
   6060 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
   6061 			       uint32_t, uint32_t, uint32_t,
   6062 			       NONE, FLAG_HAVE_SOLID_MASK)
   6063 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
   6064 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
   6065 			       uint32_t, uint32_t, uint32_t,
   6066 			       NORMAL, FLAG_HAVE_SOLID_MASK)
   6067 
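/* The table below maps (operator, source format, mask format, dest
 * format) combinations to the specialized composite routines defined in
 * this file; pixman's dispatcher picks the first matching entry.
 */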
   6068 static const pixman_fast_path_t sse2_fast_paths[] =
   6069 {
   6070     /* PIXMAN_OP_OVER */
   6071     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
   6072     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
   6073     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
   6074     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
   6075     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
   6076     PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
   6077     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
   6078     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
   6079     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
   6080     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
   6081     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
   6082     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
   6083     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
   6084     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
   6085     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
   6086     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
   6087     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
   6088     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
   6089     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
   6090     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
   6091     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
   6092     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
   6093     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
   6094     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
   6095     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
   6096     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
   6097     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
   6098     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
   6099     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
   6100     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
   6101     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
   6102     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
   6103     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
   6104     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
   6105     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
   6106     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
   6107     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
   6108     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
   6109     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
   6110     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
   6111     PIXMAN_STD_FAST_PATH<