      1 /*
      2  * Copyright © 2004, 2005 Red Hat, Inc.
      3  * Copyright © 2004 Nicholas Miell
      4  * Copyright © 2005 Trolltech AS
      5  *
      6  * Permission to use, copy, modify, distribute, and sell this software and its
      7  * documentation for any purpose is hereby granted without fee, provided that
      8  * the above copyright notice appear in all copies and that both that
      9  * copyright notice and this permission notice appear in supporting
     10  * documentation, and that the name of Red Hat not be used in advertising or
     11  * publicity pertaining to distribution of the software without specific,
     12  * written prior permission.  Red Hat makes no representations about the
     13  * suitability of this software for any purpose.  It is provided "as is"
     14  * without express or implied warranty.
     15  *
     16  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
     17  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
     18  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
     19  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     20  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
     21  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
     22  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
     23  * SOFTWARE.
     24  *
     25  * Author:  Søren Sandmann (sandmann@redhat.com)
     26  * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
     27  * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
     28  *
     29  * Based on work by Owen Taylor
     30  */
     31 
     32 #ifdef HAVE_CONFIG_H
     33 #include <config.h>
     34 #endif
     35 
     36 #if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
     37 
     38 #ifdef USE_LOONGSON_MMI
     39 #include <loongson-mmintrin.h>
     40 #else
     41 #include <mmintrin.h>
     42 #endif
     43 #include "pixman-private.h"
     44 #include "pixman-combine32.h"
     45 #include "pixman-inlines.h"
     46 
     47 #ifdef VERBOSE
     48 #define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
     49 #else
     50 #define CHECKPOINT()
     51 #endif
     52 
     53 #if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
     54 /* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this.  */
     55 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     56 _mm_empty (void)
     57 {
     58 
     59 }
     60 #endif
     61 
     62 #ifdef USE_X86_MMX
     63 # if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
     64 #  include <xmmintrin.h>
     65 # else
     66 /* We have to compile with -msse to use xmmintrin.h, but that causes SSE
     67  * instructions to be generated that we don't want. Just duplicate the
     68  * functions we want to use.  */
     69 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     70 _mm_movemask_pi8 (__m64 __A)
     71 {
     72     int ret;
     73 
     74     asm ("pmovmskb %1, %0\n\t"
     75 	: "=r" (ret)
     76 	: "y" (__A)
     77     );
     78 
     79     return ret;
     80 }
     81 
     82 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     83 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
     84 {
     85     asm ("pmulhuw %1, %0\n\t"
     86 	: "+y" (__A)
     87 	: "y" (__B)
     88     );
     89     return __A;
     90 }
     91 
     92 #  ifdef __OPTIMIZE__
     93 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     94 _mm_shuffle_pi16 (__m64 __A, int8_t const __N)
     95 {
     96     __m64 ret;
     97 
     98     asm ("pshufw %2, %1, %0\n\t"
     99 	: "=y" (ret)
    100 	: "y" (__A), "K" (__N)
    101     );
    102 
    103     return ret;
    104 }
    105 #  else
    106 #   define _mm_shuffle_pi16(A, N)					\
    107     ({									\
    108 	__m64 ret;							\
    109 									\
    110 	asm ("pshufw %2, %1, %0\n\t"					\
    111 	     : "=y" (ret)						\
    112 	     : "y" (A), "K" ((const int8_t)N)				\
    113 	);								\
    114 									\
    115 	ret;								\
    116     })
    117 #  endif
    118 # endif
    119 #endif
    120 
    121 #ifndef _MSC_VER
    122 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
    123  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
    124 #endif
    125 
    126 /* Notes about writing mmx code
    127  *
    128  * Give memory operands as the second operand. If you give one as the
    129  * first, gcc will first load it into a register and then use that
    130  * register.
    131  *
    132  *   i.e. use
    133  *
    134  *         _mm_mullo_pi16 (x, mmx_constant);
    135  *
    136  *   not
    137  *
    138  *         _mm_mullo_pi16 (mmx_constant, x);
    139  *
    140  * Also try to minimize dependencies, i.e. when you need a value, try
    141  * to calculate it from a value that was computed as early as
    142  * possible.
    143  */
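/* A hypothetical sketch of the second point, using the helpers defined
 * later in this file: in an OVER step, the negated source alpha depends
 * only on the source, so computing it right after the source load lets
 * it overlap with the independent destination load:
 *
 *         __m64 s  = load8888 (src);
 *         __m64 na = negate (expand_alpha (s));
 *         __m64 d  = load8888 (dst);
 *
 *         store8888 (dst, pix_add (s, pix_multiply (d, na)));
 */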
    144 
    145 /* --------------- MMX primitives ------------------------------------- */
    146 
    147 /* If __m64 is defined as a struct or union, then define M64_MEMBER to be
    148  * the name of the member used to access the data.
    149  * If __m64 requires using mm_cvt* intrinsics functions to convert between
    150  * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
    151  * If __m64 and uint64_t values can just be cast to each other directly,
    152  * then define USE_M64_CASTS.
    153  * If __m64 is a double datatype, then define USE_M64_DOUBLE.
    154  */
    155 #ifdef _MSC_VER
    156 # define M64_MEMBER m64_u64
    157 #elif defined(__ICC)
    158 # define USE_CVT_INTRINSICS
    159 #elif defined(USE_LOONGSON_MMI)
    160 # define USE_M64_DOUBLE
    161 #elif defined(__GNUC__)
    162 # define USE_M64_CASTS
    163 #elif defined(__SUNPRO_C)
    164 # if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
    165 /* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
    166  * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
    167  * is defined.   If it is used, then the mm_cvt* intrinsics must be used.
    168  */
    169 #  define USE_CVT_INTRINSICS
    170 # else
    171 /* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
    172  * disabled, __m64 is defined as a struct containing "unsigned long long l_".
    173  */
    174 #  define M64_MEMBER l_
    175 # endif
    176 #endif
    177 
    178 #if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
    179 typedef uint64_t mmxdatafield;
    180 #else
    181 typedef __m64 mmxdatafield;
    182 #endif
    183 
    184 typedef struct
    185 {
    186     mmxdatafield mmx_4x00ff;
    187     mmxdatafield mmx_4x0080;
    188     mmxdatafield mmx_565_rgb;
    189     mmxdatafield mmx_565_unpack_multiplier;
    190     mmxdatafield mmx_565_pack_multiplier;
    191     mmxdatafield mmx_565_r;
    192     mmxdatafield mmx_565_g;
    193     mmxdatafield mmx_565_b;
    194     mmxdatafield mmx_packed_565_rb;
    195     mmxdatafield mmx_packed_565_g;
    196     mmxdatafield mmx_expand_565_g;
    197     mmxdatafield mmx_expand_565_b;
    198     mmxdatafield mmx_expand_565_r;
    199 #ifndef USE_LOONGSON_MMI
    200     mmxdatafield mmx_mask_0;
    201     mmxdatafield mmx_mask_1;
    202     mmxdatafield mmx_mask_2;
    203     mmxdatafield mmx_mask_3;
    204 #endif
    205     mmxdatafield mmx_full_alpha;
    206     mmxdatafield mmx_4x0101;
    207     mmxdatafield mmx_ff000000;
    208 } mmx_data_t;
    209 
    210 #if defined(_MSC_VER)
    211 # define MMXDATA_INIT(field, val) { val ## UI64 }
    212 #elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
    213 # define MMXDATA_INIT(field, val) field =   { val ## ULL }
    214 #else                           /* mmxdatafield is an integral type */
    215 # define MMXDATA_INIT(field, val) field =   val ## ULL
    216 #endif
    217 
    218 static const mmx_data_t c =
    219 {
    220     MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
    221     MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
    222     MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
    223     MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
    224     MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004),
    225     MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
    226     MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
    227     MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
    228     MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
    229     MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
    230     MMXDATA_INIT (.mmx_expand_565_g,             0x07e007e007e007e0),
    231     MMXDATA_INIT (.mmx_expand_565_b,             0x001f001f001f001f),
    232     MMXDATA_INIT (.mmx_expand_565_r,             0xf800f800f800f800),
    233 #ifndef USE_LOONGSON_MMI
    234     MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
    235     MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
    236     MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
    237     MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
    238 #endif
    239     MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
    240     MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
    241     MMXDATA_INIT (.mmx_ff000000,                 0xff000000ff000000),
    242 };
    243 
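/* MC(x) yields the constant c.mmx_<x> as an __m64, whichever of the
 * representations described above __m64 happens to have.
 */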
    244 #ifdef USE_CVT_INTRINSICS
    245 #    define MC(x) to_m64 (c.mmx_ ## x)
    246 #elif defined(USE_M64_CASTS)
    247 #    define MC(x) ((__m64)c.mmx_ ## x)
    248 #elif defined(USE_M64_DOUBLE)
    249 #    define MC(x) (*(__m64 *)&c.mmx_ ## x)
    250 #else
    251 #    define MC(x) c.mmx_ ## x
    252 #endif
    253 
    254 static force_inline __m64
    255 to_m64 (uint64_t x)
    256 {
    257 #ifdef USE_CVT_INTRINSICS
    258     return _mm_cvtsi64_m64 (x);
    259 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    260     __m64 res;
    261 
    262     res.M64_MEMBER = x;
    263     return res;
    264 #elif defined USE_M64_DOUBLE
    265     return *(__m64 *)&x;
    266 #else /* USE_M64_CASTS */
    267     return (__m64)x;
    268 #endif
    269 }
    270 
    271 static force_inline uint64_t
    272 to_uint64 (__m64 x)
    273 {
    274 #ifdef USE_CVT_INTRINSICS
    275     return _mm_cvtm64_si64 (x);
    276 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    277     uint64_t res = x.M64_MEMBER;
    278     return res;
    279 #elif defined USE_M64_DOUBLE
    280     return *(uint64_t *)&x;
    281 #else /* USE_M64_CASTS */
    282     return (uint64_t)x;
    283 #endif
    284 }
    285 
    286 static force_inline __m64
    287 shift (__m64 v,
    288        int   s)
    289 {
    290     if (s > 0)
    291 	return _mm_slli_si64 (v, s);
    292     else if (s < 0)
    293 	return _mm_srli_si64 (v, -s);
    294     else
    295 	return v;
    296 }
    297 
    298 static force_inline __m64
    299 negate (__m64 mask)
    300 {
    301     return _mm_xor_si64 (mask, MC (4x00ff));
    302 }
    303 
    304 static force_inline __m64
    305 pix_multiply (__m64 a, __m64 b)
    306 {
    307     __m64 res;
    308 
    309     res = _mm_mullo_pi16 (a, b);
    310     res = _mm_adds_pu16 (res, MC (4x0080));
    311     res = _mm_mulhi_pu16 (res, MC (4x0101));
    312 
    313     return res;
    314 }
    315 
    316 static force_inline __m64
    317 pix_add (__m64 a, __m64 b)
    318 {
    319     return _mm_adds_pu8 (a, b);
    320 }
    321 
    322 static force_inline __m64
    323 expand_alpha (__m64 pixel)
    324 {
    325     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
    326 }
    327 
    328 static force_inline __m64
    329 expand_alpha_rev (__m64 pixel)
    330 {
    331     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
    332 }
    333 
    334 static force_inline __m64
    335 invert_colors (__m64 pixel)
    336 {
    337     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
    338 }
    339 
    340 static force_inline __m64
    341 over (__m64 src,
    342       __m64 srca,
    343       __m64 dest)
    344 {
    345     return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
    346 }
    347 
    348 static force_inline __m64
    349 over_rev_non_pre (__m64 src, __m64 dest)
    350 {
    351     __m64 srca = expand_alpha (src);
    352     __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
    353 
    354     return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
    355 }
    356 
    357 static force_inline __m64
    358 in (__m64 src, __m64 mask)
    359 {
    360     return pix_multiply (src, mask);
    361 }
    362 
    363 #ifndef _MSC_VER
    364 static force_inline __m64
    365 in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
    366 {
    367     return over (in (src, mask), pix_multiply (srca, mask), dest);
    368 }
    369 
    370 #else
    371 
    372 #define in_over(src, srca, mask, dest)					\
    373     over (in (src, mask), pix_multiply (srca, mask), dest)
    374 
    375 #endif
    376 
    377 /* Elemental unaligned loads */
    378 
    379 static force_inline __m64 ldq_u(__m64 *p)
    380 {
    381 #ifdef USE_X86_MMX
    382     /* x86's alignment restrictions are very relaxed. */
    383     return *(__m64 *)p;
    384 #elif defined USE_ARM_IWMMXT
    385     int align = (uintptr_t)p & 7;
    386     __m64 *aligned_p;
    387     if (align == 0)
    388 	return *p;
    389     aligned_p = (__m64 *)((uintptr_t)p & ~7);
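    /* Read the two aligned quadwords that straddle p and let the iwMMXt
     * alignment intrinsic extract the misaligned value (align is the
     * byte offset of p).
     */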
    390     return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
    391 #else
    392     struct __una_u64 { __m64 x __attribute__((packed)); };
    393     const struct __una_u64 *ptr = (const struct __una_u64 *) p;
    394     return (__m64) ptr->x;
    395 #endif
    396 }
    397 
    398 static force_inline uint32_t ldl_u(const uint32_t *p)
    399 {
    400 #ifdef USE_X86_MMX
    401     /* x86's alignment restrictions are very relaxed. */
    402     return *p;
    403 #else
    404     struct __una_u32 { uint32_t x __attribute__((packed)); };
    405     const struct __una_u32 *ptr = (const struct __una_u32 *) p;
    406     return ptr->x;
    407 #endif
    408 }
    409 
    410 static force_inline __m64
    411 load (const uint32_t *v)
    412 {
    413 #ifdef USE_LOONGSON_MMI
    414     __m64 ret;
    415     asm ("lwc1 %0, %1\n\t"
    416 	: "=f" (ret)
    417 	: "m" (*v)
    418     );
    419     return ret;
    420 #else
    421     return _mm_cvtsi32_si64 (*v);
    422 #endif
    423 }
    424 
    425 static force_inline __m64
    426 load8888 (const uint32_t *v)
    427 {
    428 #ifdef USE_LOONGSON_MMI
    429     return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
    430 #else
    431     return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
    432 #endif
    433 }
    434 
    435 static force_inline __m64
    436 load8888u (const uint32_t *v)
    437 {
    438     uint32_t l = ldl_u (v);
    439     return load8888 (&l);
    440 }
    441 
    442 static force_inline __m64
    443 pack8888 (__m64 lo, __m64 hi)
    444 {
    445     return _mm_packs_pu16 (lo, hi);
    446 }
    447 
    448 static force_inline void
    449 store (uint32_t *dest, __m64 v)
    450 {
    451 #ifdef USE_LOONGSON_MMI
    452     asm ("swc1 %1, %0\n\t"
    453 	: "=m" (*dest)
    454 	: "f" (v)
    455 	: "memory"
    456     );
    457 #else
    458     *dest = _mm_cvtsi64_si32 (v);
    459 #endif
    460 }
    461 
    462 static force_inline void
    463 store8888 (uint32_t *dest, __m64 v)
    464 {
    465     v = pack8888 (v, _mm_setzero_si64 ());
    466     store (dest, v);
    467 }
    468 
    469 static force_inline pixman_bool_t
    470 is_equal (__m64 a, __m64 b)
    471 {
    472 #ifdef USE_LOONGSON_MMI
    473     /* __m64 is a double, so we can compare directly. */
    474     return a == b;
    475 #else
    476     return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
    477 #endif
    478 }
    479 
    480 static force_inline pixman_bool_t
    481 is_opaque (__m64 v)
    482 {
    483 #ifdef USE_LOONGSON_MMI
    484     return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
    485 #else
    486     __m64 ffs = _mm_cmpeq_pi8 (v, v);
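    /* In an expanded 00AA00RR00GG00BB pixel the alpha lives in byte 6,
     * so bit 6 of the byte-equality mask is set iff alpha == 0xff.
     */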
    487     return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
    488 #endif
    489 }
    490 
    491 static force_inline pixman_bool_t
    492 is_zero (__m64 v)
    493 {
    494     return is_equal (v, _mm_setzero_si64 ());
    495 }
    496 
    497 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
    498  *
    499  *    00RR00GG00BB
    500  *
    501  * --- Expanding 565 in the low word ---
    502  *
    503  * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
    504  * m = m & (01f0003f001f);
    505  * m = m * (008404100840);
    506  * m = m >> 8;
    507  *
    508  * Note the trick here - the top word is shifted by another nibble to
    509  * avoid it bumping into the middle word
    510  */
    511 static force_inline __m64
    512 expand565 (__m64 pixel, int pos)
    513 {
    514     __m64 p = pixel;
    515     __m64 t1, t2;
    516 
    517     /* move the pixel to the low 16 bits and zero the rest */
    518 #ifdef USE_LOONGSON_MMI
    519     p = loongson_extract_pi16 (p, pos);
    520 #else
    521     p = shift (shift (p, (3 - pos) * 16), -48);
    522 #endif
    523 
    524     t1 = shift (p, 36 - 11);
    525     t2 = shift (p, 16 - 5);
    526 
    527     p = _mm_or_si64 (t1, p);
    528     p = _mm_or_si64 (t2, p);
    529     p = _mm_and_si64 (p, MC (565_rgb));
    530 
    531     pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
    532     return _mm_srli_pi16 (pixel, 8);
    533 }
    534 
    535 /* Expand 4 packed 565 pixels in an mmx register into two registers of
    536  * two packed 8888 pixels each:
    537  *    AARRGGBBAARRGGBB
    538  */
    539 static force_inline void
    540 expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
    541 {
    542     __m64 t0, t1, alpha = _mm_setzero_si64 ();
    543     __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
    544     __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
    545     __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
    546     if (full_alpha)
    547 	alpha = _mm_cmpeq_pi32 (alpha, alpha);
    548 
    549     /* Replicate high bits into empty low bits. */
    550     r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
    551     g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
    552     b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
    553 
    554     r = _mm_packs_pu16 (r, _mm_setzero_si64 ());	/* 00 00 00 00 R3 R2 R1 R0 */
    555     g = _mm_packs_pu16 (g, _mm_setzero_si64 ());	/* 00 00 00 00 G3 G2 G1 G0 */
    556     b = _mm_packs_pu16 (b, _mm_setzero_si64 ());	/* 00 00 00 00 B3 B2 B1 B0 */
    557 
    558     t1 = _mm_unpacklo_pi8 (r, alpha);			/* A3 R3 A2 R2 A1 R1 A0 R0 */
    559     t0 = _mm_unpacklo_pi8 (b, g);			/* G3 B3 G2 B2 G1 B1 G0 B0 */
    560 
    561     *vout0 = _mm_unpacklo_pi16 (t0, t1);		/* A1 R1 G1 B1 A0 R0 G0 B0 */
    562     *vout1 = _mm_unpackhi_pi16 (t0, t1);		/* A3 R3 G3 B3 A2 R2 G2 B2 */
    563 }
    564 
    565 static force_inline __m64
    566 expand8888 (__m64 in, int pos)
    567 {
    568     if (pos == 0)
    569 	return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
    570     else
    571 	return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
    572 }
    573 
    574 static force_inline __m64
    575 expandx888 (__m64 in, int pos)
    576 {
    577     return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
    578 }
    579 
    580 static force_inline void
    581 expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
    582 {
    583     __m64 v0, v1;
    584     expand_4xpacked565 (vin, &v0, &v1, full_alpha);
    585     *vout0 = expand8888 (v0, 0);
    586     *vout1 = expand8888 (v0, 1);
    587     *vout2 = expand8888 (v1, 0);
    588     *vout3 = expand8888 (v1, 1);
    589 }
    590 
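/* Convert one expanded pixel to r5g6b5 and insert it into 16-bit slot
 * @pos of @target, leaving the other three slots untouched.
 */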
    591 static force_inline __m64
    592 pack_565 (__m64 pixel, __m64 target, int pos)
    593 {
    594     __m64 p = pixel;
    595     __m64 t = target;
    596     __m64 r, g, b;
    597 
    598     r = _mm_and_si64 (p, MC (565_r));
    599     g = _mm_and_si64 (p, MC (565_g));
    600     b = _mm_and_si64 (p, MC (565_b));
    601 
    602 #ifdef USE_LOONGSON_MMI
    603     r = shift (r, -(32 - 8));
    604     g = shift (g, -(16 - 3));
    605     b = shift (b, -(0  + 3));
    606 
    607     p = _mm_or_si64 (r, g);
    608     p = _mm_or_si64 (p, b);
    609     return loongson_insert_pi16 (t, p, pos);
    610 #else
    611     r = shift (r, -(32 - 8) + pos * 16);
    612     g = shift (g, -(16 - 3) + pos * 16);
    613     b = shift (b, -(0  + 3) + pos * 16);
    614 
    615     if (pos == 0)
    616 	t = _mm_and_si64 (t, MC (mask_0));
    617     else if (pos == 1)
    618 	t = _mm_and_si64 (t, MC (mask_1));
    619     else if (pos == 2)
    620 	t = _mm_and_si64 (t, MC (mask_2));
    621     else if (pos == 3)
    622 	t = _mm_and_si64 (t, MC (mask_3));
    623 
    624     p = _mm_or_si64 (r, t);
    625     p = _mm_or_si64 (g, p);
    626 
    627     return _mm_or_si64 (b, p);
    628 #endif
    629 }
    630 
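/* Convert two registers of two packed 8888 pixels each into one register
 * of four packed 565 pixels.  The multiply-add against 565_pack_multiplier
 * merges the masked red and blue fields of each pixel into one 32-bit lane,
 * green is or'ed in, and a shift by 5 lines the result up as r5g6b5 before
 * the four half-results are interleaved back together.
 */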
    631 static force_inline __m64
    632 pack_4xpacked565 (__m64 a, __m64 b)
    633 {
    634     __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
    635     __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
    636 
    637     __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
    638     __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
    639 
    640     __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
    641     __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
    642 
    643     t0 = _mm_or_si64 (t0, g0);
    644     t1 = _mm_or_si64 (t1, g1);
    645 
    646     t0 = shift(t0, -5);
    647 #ifdef USE_ARM_IWMMXT
    648     t1 = shift(t1, -5);
    649     return _mm_packs_pu32 (t0, t1);
    650 #else
    651     t1 = shift(t1, -5 + 16);
    652     return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
    653 #endif
    654 }
    655 
    656 #ifndef _MSC_VER
    657 
    658 static force_inline __m64
    659 pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
    660 {
    661     return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
    662 }
    663 
    664 static force_inline __m64
    665 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
    666 {
    667     x = pix_multiply (x, a);
    668     y = pix_multiply (y, b);
    669 
    670     return pix_add (x, y);
    671 }
    672 
    673 #else
    674 
    675 /* MSVC can only pass up to three SSE/MMX vector arguments by register */
    676 
    677 #define pack_4x565(v0, v1, v2, v3) \
    678     pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))
    679 
    680 #define pix_add_mul(x, a, y, b)	 \
    681     ( x = pix_multiply (x, a),	 \
    682       y = pix_multiply (y, b),	 \
    683       pix_add (x, y) )
    684 
    685 #endif
    686 
    687 /* --------------- MMX code paths for fbcompose.c --------------------- */
    688 
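/* Fetch one source pixel expanded to 16 bits per channel; when a mask is
 * present, scale the source by the mask pixel's expanded alpha.  This is
 * the common source fetch for the unified (_u) combiners below.
 */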
    689 static force_inline __m64
    690 combine (const uint32_t *src, const uint32_t *mask)
    691 {
    692     __m64 vsrc = load8888 (src);
    693 
    694     if (mask)
    695     {
    696 	__m64 m = load8888 (mask);
    697 
    698 	m = expand_alpha (m);
    699 	vsrc = pix_multiply (vsrc, m);
    700     }
    701 
    702     return vsrc;
    703 }
    704 
    705 static force_inline __m64
    706 core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
    707 {
    708     vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
    709 
    710     if (is_opaque (vsrc))
    711     {
    712 	return vsrc;
    713     }
    714     else if (!is_zero (vsrc))
    715     {
    716 	return over (vsrc, expand_alpha (vsrc),
    717 		     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
    718     }
    719 
    720     return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
    721 }
    722 
    723 static void
    724 mmx_combine_over_u (pixman_implementation_t *imp,
    725                     pixman_op_t              op,
    726                     uint32_t *               dest,
    727                     const uint32_t *         src,
    728                     const uint32_t *         mask,
    729                     int                      width)
    730 {
    731     const uint32_t *end = dest + width;
    732 
    733     while (dest < end)
    734     {
    735 	__m64 vsrc = combine (src, mask);
    736 
    737 	if (is_opaque (vsrc))
    738 	{
    739 	    store8888 (dest, vsrc);
    740 	}
    741 	else if (!is_zero (vsrc))
    742 	{
    743 	    __m64 sa = expand_alpha (vsrc);
    744 	    store8888 (dest, over (vsrc, sa, load8888 (dest)));
    745 	}
    746 
    747 	++dest;
    748 	++src;
    749 	if (mask)
    750 	    ++mask;
    751     }
    752     _mm_empty ();
    753 }
    754 
    755 static void
    756 mmx_combine_over_reverse_u (pixman_implementation_t *imp,
    757                             pixman_op_t              op,
    758                             uint32_t *               dest,
    759                             const uint32_t *         src,
    760                             const uint32_t *         mask,
    761                             int                      width)
    762 {
    763     const uint32_t *end = dest + width;
    764 
    765     while (dest < end)
    766     {
    767 	__m64 d, da;
    768 	__m64 s = combine (src, mask);
    769 
    770 	d = load8888 (dest);
    771 	da = expand_alpha (d);
    772 	store8888 (dest, over (d, da, s));
    773 
    774 	++dest;
    775 	++src;
    776 	if (mask)
    777 	    mask++;
    778     }
    779     _mm_empty ();
    780 }
    781 
    782 static void
    783 mmx_combine_in_u (pixman_implementation_t *imp,
    784                   pixman_op_t              op,
    785                   uint32_t *               dest,
    786                   const uint32_t *         src,
    787                   const uint32_t *         mask,
    788                   int                      width)
    789 {
    790     const uint32_t *end = dest + width;
    791 
    792     while (dest < end)
    793     {
    794 	__m64 a;
    795 	__m64 x = combine (src, mask);
    796 
    797 	a = load8888 (dest);
    798 	a = expand_alpha (a);
    799 	x = pix_multiply (x, a);
    800 
    801 	store8888 (dest, x);
    802 
    803 	++dest;
    804 	++src;
    805 	if (mask)
    806 	    mask++;
    807     }
    808     _mm_empty ();
    809 }
    810 
    811 static void
    812 mmx_combine_in_reverse_u (pixman_implementation_t *imp,
    813                           pixman_op_t              op,
    814                           uint32_t *               dest,
    815                           const uint32_t *         src,
    816                           const uint32_t *         mask,
    817                           int                      width)
    818 {
    819     const uint32_t *end = dest + width;
    820 
    821     while (dest < end)
    822     {
    823 	__m64 a = combine (src, mask);
    824 	__m64 x;
    825 
    826 	x = load8888 (dest);
    827 	a = expand_alpha (a);
    828 	x = pix_multiply (x, a);
    829 	store8888 (dest, x);
    830 
    831 	++dest;
    832 	++src;
    833 	if (mask)
    834 	    mask++;
    835     }
    836     _mm_empty ();
    837 }
    838 
    839 static void
    840 mmx_combine_out_u (pixman_implementation_t *imp,
    841                    pixman_op_t              op,
    842                    uint32_t *               dest,
    843                    const uint32_t *         src,
    844                    const uint32_t *         mask,
    845                    int                      width)
    846 {
    847     const uint32_t *end = dest + width;
    848 
    849     while (dest < end)
    850     {
    851 	__m64 a;
    852 	__m64 x = combine (src, mask);
    853 
    854 	a = load8888 (dest);
    855 	a = expand_alpha (a);
    856 	a = negate (a);
    857 	x = pix_multiply (x, a);
    858 	store8888 (dest, x);
    859 
    860 	++dest;
    861 	++src;
    862 	if (mask)
    863 	    mask++;
    864     }
    865     _mm_empty ();
    866 }
    867 
    868 static void
    869 mmx_combine_out_reverse_u (pixman_implementation_t *imp,
    870                            pixman_op_t              op,
    871                            uint32_t *               dest,
    872                            const uint32_t *         src,
    873                            const uint32_t *         mask,
    874                            int                      width)
    875 {
    876     const uint32_t *end = dest + width;
    877 
    878     while (dest < end)
    879     {
    880 	__m64 a = combine (src, mask);
    881 	__m64 x;
    882 
    883 	x = load8888 (dest);
    884 	a = expand_alpha (a);
    885 	a = negate (a);
    886 	x = pix_multiply (x, a);
    887 
    888 	store8888 (dest, x);
    889 
    890 	++dest;
    891 	++src;
    892 	if (mask)
    893 	    mask++;
    894     }
    895     _mm_empty ();
    896 }
    897 
    898 static void
    899 mmx_combine_atop_u (pixman_implementation_t *imp,
    900                     pixman_op_t              op,
    901                     uint32_t *               dest,
    902                     const uint32_t *         src,
    903                     const uint32_t *         mask,
    904                     int                      width)
    905 {
    906     const uint32_t *end = dest + width;
    907 
    908     while (dest < end)
    909     {
    910 	__m64 da, d, sia;
    911 	__m64 s = combine (src, mask);
    912 
    913 	d = load8888 (dest);
    914 	sia = expand_alpha (s);
    915 	sia = negate (sia);
    916 	da = expand_alpha (d);
    917 	s = pix_add_mul (s, da, d, sia);
    918 	store8888 (dest, s);
    919 
    920 	++dest;
    921 	++src;
    922 	if (mask)
    923 	    mask++;
    924     }
    925     _mm_empty ();
    926 }
    927 
    928 static void
    929 mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
    930                             pixman_op_t              op,
    931                             uint32_t *               dest,
    932                             const uint32_t *         src,
    933                             const uint32_t *         mask,
    934                             int                      width)
    935 {
    936     const uint32_t *end;
    937 
    938     end = dest + width;
    939 
    940     while (dest < end)
    941     {
    942 	__m64 dia, d, sa;
    943 	__m64 s = combine (src, mask);
    944 
    945 	d = load8888 (dest);
    946 	sa = expand_alpha (s);
    947 	dia = expand_alpha (d);
    948 	dia = negate (dia);
    949 	s = pix_add_mul (s, dia, d, sa);
    950 	store8888 (dest, s);
    951 
    952 	++dest;
    953 	++src;
    954 	if (mask)
    955 	    mask++;
    956     }
    957     _mm_empty ();
    958 }
    959 
    960 static void
    961 mmx_combine_xor_u (pixman_implementation_t *imp,
    962                    pixman_op_t              op,
    963                    uint32_t *               dest,
    964                    const uint32_t *         src,
    965                    const uint32_t *         mask,
    966                    int                      width)
    967 {
    968     const uint32_t *end = dest + width;
    969 
    970     while (dest < end)
    971     {
    972 	__m64 dia, d, sia;
    973 	__m64 s = combine (src, mask);
    974 
    975 	d = load8888 (dest);
    976 	sia = expand_alpha (s);
    977 	dia = expand_alpha (d);
    978 	sia = negate (sia);
    979 	dia = negate (dia);
    980 	s = pix_add_mul (s, dia, d, sia);
    981 	store8888 (dest, s);
    982 
    983 	++dest;
    984 	++src;
    985 	if (mask)
    986 	    mask++;
    987     }
    988     _mm_empty ();
    989 }
    990 
    991 static void
    992 mmx_combine_add_u (pixman_implementation_t *imp,
    993                    pixman_op_t              op,
    994                    uint32_t *               dest,
    995                    const uint32_t *         src,
    996                    const uint32_t *         mask,
    997                    int                      width)
    998 {
    999     const uint32_t *end = dest + width;
   1000 
   1001     while (dest < end)
   1002     {
   1003 	__m64 d;
   1004 	__m64 s = combine (src, mask);
   1005 
   1006 	d = load8888 (dest);
   1007 	s = pix_add (s, d);
   1008 	store8888 (dest, s);
   1009 
   1010 	++dest;
   1011 	++src;
   1012 	if (mask)
   1013 	    mask++;
   1014     }
   1015     _mm_empty ();
   1016 }
   1017 
   1018 static void
   1019 mmx_combine_saturate_u (pixman_implementation_t *imp,
   1020                         pixman_op_t              op,
   1021                         uint32_t *               dest,
   1022                         const uint32_t *         src,
   1023                         const uint32_t *         mask,
   1024                         int                      width)
   1025 {
   1026     const uint32_t *end = dest + width;
   1027 
   1028     while (dest < end)
   1029     {
   1030 	uint32_t s, sa, da;
   1031 	uint32_t d = *dest;
   1032 	__m64 ms = combine (src, mask);
   1033 	__m64 md = load8888 (dest);
   1034 
   1035 	store8888(&s, ms);
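	/* da is the room left in the destination alpha (255 - alpha); when
	 * the source carries more alpha than fits, scale the whole source
	 * pixel by da/sa so the additive blend saturates exactly.
	 */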
   1036 	da = ~d >> 24;
   1037 	sa = s >> 24;
   1038 
   1039 	if (sa > da)
   1040 	{
   1041 	    uint32_t quot = DIV_UN8 (da, sa) << 24;
   1042 	    __m64 msa = load8888 (&quot);
   1043 	    msa = expand_alpha (msa);
   1044 	    ms = pix_multiply (ms, msa);
   1045 	}
   1046 
   1047 	md = pix_add (md, ms);
   1048 	store8888 (dest, md);
   1049 
   1050 	++src;
   1051 	++dest;
   1052 	if (mask)
   1053 	    mask++;
   1054     }
   1055     _mm_empty ();
   1056 }
   1057 
   1058 static void
   1059 mmx_combine_src_ca (pixman_implementation_t *imp,
   1060                     pixman_op_t              op,
   1061                     uint32_t *               dest,
   1062                     const uint32_t *         src,
   1063                     const uint32_t *         mask,
   1064                     int                      width)
   1065 {
   1066     const uint32_t *end = src + width;
   1067 
   1068     while (src < end)
   1069     {
   1070 	__m64 a = load8888 (mask);
   1071 	__m64 s = load8888 (src);
   1072 
   1073 	s = pix_multiply (s, a);
   1074 	store8888 (dest, s);
   1075 
   1076 	++src;
   1077 	++mask;
   1078 	++dest;
   1079     }
   1080     _mm_empty ();
   1081 }
   1082 
   1083 static void
   1084 mmx_combine_over_ca (pixman_implementation_t *imp,
   1085                      pixman_op_t              op,
   1086                      uint32_t *               dest,
   1087                      const uint32_t *         src,
   1088                      const uint32_t *         mask,
   1089                      int                      width)
   1090 {
   1091     const uint32_t *end = src + width;
   1092 
   1093     while (src < end)
   1094     {
   1095 	__m64 a = load8888 (mask);
   1096 	__m64 s = load8888 (src);
   1097 	__m64 d = load8888 (dest);
   1098 	__m64 sa = expand_alpha (s);
   1099 
   1100 	store8888 (dest, in_over (s, sa, a, d));
   1101 
   1102 	++src;
   1103 	++dest;
   1104 	++mask;
   1105     }
   1106     _mm_empty ();
   1107 }
   1108 
   1109 static void
   1110 mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
   1111                              pixman_op_t              op,
   1112                              uint32_t *               dest,
   1113                              const uint32_t *         src,
   1114                              const uint32_t *         mask,
   1115                              int                      width)
   1116 {
   1117     const uint32_t *end = src + width;
   1118 
   1119     while (src < end)
   1120     {
   1121 	__m64 a = load8888 (mask);
   1122 	__m64 s = load8888 (src);
   1123 	__m64 d = load8888 (dest);
   1124 	__m64 da = expand_alpha (d);
   1125 
   1126 	store8888 (dest, over (d, da, in (s, a)));
   1127 
   1128 	++src;
   1129 	++dest;
   1130 	++mask;
   1131     }
   1132     _mm_empty ();
   1133 }
   1134 
   1135 static void
   1136 mmx_combine_in_ca (pixman_implementation_t *imp,
   1137                    pixman_op_t              op,
   1138                    uint32_t *               dest,
   1139                    const uint32_t *         src,
   1140                    const uint32_t *         mask,
   1141                    int                      width)
   1142 {
   1143     const uint32_t *end = src + width;
   1144 
   1145     while (src < end)
   1146     {
   1147 	__m64 a = load8888 (mask);
   1148 	__m64 s = load8888 (src);
   1149 	__m64 d = load8888 (dest);
   1150 	__m64 da = expand_alpha (d);
   1151 
   1152 	s = pix_multiply (s, a);
   1153 	s = pix_multiply (s, da);
   1154 	store8888 (dest, s);
   1155 
   1156 	++src;
   1157 	++dest;
   1158 	++mask;
   1159     }
   1160     _mm_empty ();
   1161 }
   1162 
   1163 static void
   1164 mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
   1165                            pixman_op_t              op,
   1166                            uint32_t *               dest,
   1167                            const uint32_t *         src,
   1168                            const uint32_t *         mask,
   1169                            int                      width)
   1170 {
   1171     const uint32_t *end = src + width;
   1172 
   1173     while (src < end)
   1174     {
   1175 	__m64 a = load8888 (mask);
   1176 	__m64 s = load8888 (src);
   1177 	__m64 d = load8888 (dest);
   1178 	__m64 sa = expand_alpha (s);
   1179 
   1180 	a = pix_multiply (a, sa);
   1181 	d = pix_multiply (d, a);
   1182 	store8888 (dest, d);
   1183 
   1184 	++src;
   1185 	++dest;
   1186 	++mask;
   1187     }
   1188     _mm_empty ();
   1189 }
   1190 
   1191 static void
   1192 mmx_combine_out_ca (pixman_implementation_t *imp,
   1193                     pixman_op_t              op,
   1194                     uint32_t *               dest,
   1195                     const uint32_t *         src,
   1196                     const uint32_t *         mask,
   1197                     int                      width)
   1198 {
   1199     const uint32_t *end = src + width;
   1200 
   1201     while (src < end)
   1202     {
   1203 	__m64 a = load8888 (mask);
   1204 	__m64 s = load8888 (src);
   1205 	__m64 d = load8888 (dest);
   1206 	__m64 da = expand_alpha (d);
   1207 
   1208 	da = negate (da);
   1209 	s = pix_multiply (s, a);
   1210 	s = pix_multiply (s, da);
   1211 	store8888 (dest, s);
   1212 
   1213 	++src;
   1214 	++dest;
   1215 	++mask;
   1216     }
   1217     _mm_empty ();
   1218 }
   1219 
   1220 static void
   1221 mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
   1222                             pixman_op_t              op,
   1223                             uint32_t *               dest,
   1224                             const uint32_t *         src,
   1225                             const uint32_t *         mask,
   1226                             int                      width)
   1227 {
   1228     const uint32_t *end = src + width;
   1229 
   1230     while (src < end)
   1231     {
   1232 	__m64 a = load8888 (mask);
   1233 	__m64 s = load8888 (src);
   1234 	__m64 d = load8888 (dest);
   1235 	__m64 sa = expand_alpha (s);
   1236 
   1237 	a = pix_multiply (a, sa);
   1238 	a = negate (a);
   1239 	d = pix_multiply (d, a);
   1240 	store8888 (dest, d);
   1241 
   1242 	++src;
   1243 	++dest;
   1244 	++mask;
   1245     }
   1246     _mm_empty ();
   1247 }
   1248 
   1249 static void
   1250 mmx_combine_atop_ca (pixman_implementation_t *imp,
   1251                      pixman_op_t              op,
   1252                      uint32_t *               dest,
   1253                      const uint32_t *         src,
   1254                      const uint32_t *         mask,
   1255                      int                      width)
   1256 {
   1257     const uint32_t *end = src + width;
   1258 
   1259     while (src < end)
   1260     {
   1261 	__m64 a = load8888 (mask);
   1262 	__m64 s = load8888 (src);
   1263 	__m64 d = load8888 (dest);
   1264 	__m64 da = expand_alpha (d);
   1265 	__m64 sa = expand_alpha (s);
   1266 
   1267 	s = pix_multiply (s, a);
   1268 	a = pix_multiply (a, sa);
   1269 	a = negate (a);
   1270 	d = pix_add_mul (d, a, s, da);
   1271 	store8888 (dest, d);
   1272 
   1273 	++src;
   1274 	++dest;
   1275 	++mask;
   1276     }
   1277     _mm_empty ();
   1278 }
   1279 
   1280 static void
   1281 mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
   1282                              pixman_op_t              op,
   1283                              uint32_t *               dest,
   1284                              const uint32_t *         src,
   1285                              const uint32_t *         mask,
   1286                              int                      width)
   1287 {
   1288     const uint32_t *end = src + width;
   1289 
   1290     while (src < end)
   1291     {
   1292 	__m64 a = load8888 (mask);
   1293 	__m64 s = load8888 (src);
   1294 	__m64 d = load8888 (dest);
   1295 	__m64 da = expand_alpha (d);
   1296 	__m64 sa = expand_alpha (s);
   1297 
   1298 	s = pix_multiply (s, a);
   1299 	a = pix_multiply (a, sa);
   1300 	da = negate (da);
   1301 	d = pix_add_mul (d, a, s, da);
   1302 	store8888 (dest, d);
   1303 
   1304 	++src;
   1305 	++dest;
   1306 	++mask;
   1307     }
   1308     _mm_empty ();
   1309 }
   1310 
   1311 static void
   1312 mmx_combine_xor_ca (pixman_implementation_t *imp,
   1313                     pixman_op_t              op,
   1314                     uint32_t *               dest,
   1315                     const uint32_t *         src,
   1316                     const uint32_t *         mask,
   1317                     int                      width)
   1318 {
   1319     const uint32_t *end = src + width;
   1320 
   1321     while (src < end)
   1322     {
   1323 	__m64 a = load8888 (mask);
   1324 	__m64 s = load8888 (src);
   1325 	__m64 d = load8888 (dest);
   1326 	__m64 da = expand_alpha (d);
   1327 	__m64 sa = expand_alpha (s);
   1328 
   1329 	s = pix_multiply (s, a);
   1330 	a = pix_multiply (a, sa);
   1331 	da = negate (da);
   1332 	a = negate (a);
   1333 	d = pix_add_mul (d, a, s, da);
   1334 	store8888 (dest, d);
   1335 
   1336 	++src;
   1337 	++dest;
   1338 	++mask;
   1339     }
   1340     _mm_empty ();
   1341 }
   1342 
   1343 static void
   1344 mmx_combine_add_ca (pixman_implementation_t *imp,
   1345                     pixman_op_t              op,
   1346                     uint32_t *               dest,
   1347                     const uint32_t *         src,
   1348                     const uint32_t *         mask,
   1349                     int                      width)
   1350 {
   1351     const uint32_t *end = src + width;
   1352 
   1353     while (src < end)
   1354     {
   1355 	__m64 a = load8888 (mask);
   1356 	__m64 s = load8888 (src);
   1357 	__m64 d = load8888 (dest);
   1358 
   1359 	s = pix_multiply (s, a);
   1360 	d = pix_add (s, d);
   1361 	store8888 (dest, d);
   1362 
   1363 	++src;
   1364 	++dest;
   1365 	++mask;
   1366     }
   1367     _mm_empty ();
   1368 }
   1369 
   1370 /* ------------- MMX code paths called from fbpict.c -------------------- */
   1371 
   1372 static void
   1373 mmx_composite_over_n_8888 (pixman_implementation_t *imp,
   1374                            pixman_composite_info_t *info)
   1375 {
   1376     PIXMAN_COMPOSITE_ARGS (info);
   1377     uint32_t src;
   1378     uint32_t    *dst_line, *dst;
   1379     int32_t w;
   1380     int dst_stride;
   1381     __m64 vsrc, vsrca;
   1382 
   1383     CHECKPOINT ();
   1384 
   1385     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   1386 
   1387     if (src == 0)
   1388 	return;
   1389 
   1390     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   1391 
   1392     vsrc = load8888 (&src);
   1393     vsrca = expand_alpha (vsrc);
   1394 
   1395     while (height--)
   1396     {
   1397 	dst = dst_line;
   1398 	dst_line += dst_stride;
   1399 	w = width;
   1400 
   1401 	CHECKPOINT ();
   1402 
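	/* Composite single pixels until dst is 8-byte aligned, then two
	 * packed pixels per iteration, then the final odd pixel, if any.
	 */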
   1403 	while (w && (uintptr_t)dst & 7)
   1404 	{
   1405 	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
   1406 
   1407 	    w--;
   1408 	    dst++;
   1409 	}
   1410 
   1411 	while (w >= 2)
   1412 	{
   1413 	    __m64 vdest;
   1414 	    __m64 dest0, dest1;
   1415 
   1416 	    vdest = *(__m64 *)dst;
   1417 
   1418 	    dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
   1419 	    dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
   1420 
   1421 	    *(__m64 *)dst = pack8888 (dest0, dest1);
   1422 
   1423 	    dst += 2;
   1424 	    w -= 2;
   1425 	}
   1426 
   1427 	CHECKPOINT ();
   1428 
   1429 	if (w)
   1430 	{
   1431 	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
   1432 	}
   1433     }
   1434 
   1435     _mm_empty ();
   1436 }
   1437 
   1438 static void
   1439 mmx_composite_over_n_0565 (pixman_implementation_t *imp,
   1440                            pixman_composite_info_t *info)
   1441 {
   1442     PIXMAN_COMPOSITE_ARGS (info);
   1443     uint32_t src;
   1444     uint16_t    *dst_line, *dst;
   1445     int32_t w;
   1446     int dst_stride;
   1447     __m64 vsrc, vsrca;
   1448 
   1449     CHECKPOINT ();
   1450 
   1451     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   1452 
   1453     if (src == 0)
   1454 	return;
   1455 
   1456     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   1457 
   1458     vsrc = load8888 (&src);
   1459     vsrca = expand_alpha (vsrc);
   1460 
   1461     while (height--)
   1462     {
   1463 	dst = dst_line;
   1464 	dst_line += dst_stride;
   1465 	w = width;
   1466 
   1467 	CHECKPOINT ();
   1468 
   1469 	while (w && (uintptr_t)dst & 7)
   1470 	{
   1471 	    uint64_t d = *dst;
   1472 	    __m64 vdest = expand565 (to_m64 (d), 0);
   1473 
   1474 	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
   1475 	    *dst = to_uint64 (vdest);
   1476 
   1477 	    w--;
   1478 	    dst++;
   1479 	}
   1480 
   1481 	while (w >= 4)
   1482 	{
   1483 	    __m64 vdest = *(__m64 *)dst;
   1484 	    __m64 v0, v1, v2, v3;
   1485 
   1486 	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
   1487 
   1488 	    v0 = over (vsrc, vsrca, v0);
   1489 	    v1 = over (vsrc, vsrca, v1);
   1490 	    v2 = over (vsrc, vsrca, v2);
   1491 	    v3 = over (vsrc, vsrca, v3);
   1492 
   1493 	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
   1494 
   1495 	    dst += 4;
   1496 	    w -= 4;
   1497 	}
   1498 
   1499 	CHECKPOINT ();
   1500 
   1501 	while (w)
   1502 	{
   1503 	    uint64_t d = *dst;
   1504 	    __m64 vdest = expand565 (to_m64 (d), 0);
   1505 
   1506 	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
   1507 	    *dst = to_uint64 (vdest);
   1508 
   1509 	    w--;
   1510 	    dst++;
   1511 	}
   1512     }
   1513 
   1514     _mm_empty ();
   1515 }
   1516 
   1517 static void
   1518 mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
   1519                                    pixman_composite_info_t *info)
   1520 {
   1521     PIXMAN_COMPOSITE_ARGS (info);
   1522     uint32_t src;
   1523     uint32_t    *dst_line;
   1524     uint32_t    *mask_line;
   1525     int dst_stride, mask_stride;
   1526     __m64 vsrc, vsrca;
   1527 
   1528     CHECKPOINT ();
   1529 
   1530     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   1531 
   1532     if (src == 0)
   1533 	return;
   1534 
   1535     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   1536     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
   1537 
   1538     vsrc = load8888 (&src);
   1539     vsrca = expand_alpha (vsrc);
   1540 
   1541     while (height--)
   1542     {
   1543 	int twidth = width;
   1544 	uint32_t *p = (uint32_t *)mask_line;
   1545 	uint32_t *q = (uint32_t *)dst_line;
   1546 
   1547 	while (twidth && (uintptr_t)q & 7)
   1548 	{
   1549 	    uint32_t m = *(uint32_t *)p;
   1550 
   1551 	    if (m)
   1552 	    {
   1553 		__m64 vdest = load8888 (q);
   1554 		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
   1555 		store8888 (q, vdest);
   1556 	    }
   1557 
   1558 	    twidth--;
   1559 	    p++;
   1560 	    q++;
   1561 	}
   1562 
   1563 	while (twidth >= 2)
   1564 	{
   1565 	    uint32_t m0, m1;
   1566 	    m0 = *p;
   1567 	    m1 = *(p + 1);
   1568 
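	    /* Skip the read-modify-write entirely when both mask pixels
	     * are zero.
	     */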
   1569 	    if (m0 | m1)
   1570 	    {
   1571 		__m64 dest0, dest1;
   1572 		__m64 vdest = *(__m64 *)q;
   1573 
   1574 		dest0 = in_over (vsrc, vsrca, load8888 (&m0),
   1575 		                 expand8888 (vdest, 0));
   1576 		dest1 = in_over (vsrc, vsrca, load8888 (&m1),
   1577 		                 expand8888 (vdest, 1));
   1578 
   1579 		*(__m64 *)q = pack8888 (dest0, dest1);
   1580 	    }
   1581 
   1582 	    p += 2;
   1583 	    q += 2;
   1584 	    twidth -= 2;
   1585 	}
   1586 
   1587 	if (twidth)
   1588 	{
   1589 	    uint32_t m = *(uint32_t *)p;
   1590 
   1591 	    if (m)
   1592 	    {
   1593 		__m64 vdest = load8888 (q);
   1594 		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
   1595 		store8888 (q, vdest);
   1596 	    }
   1597 
   1598 	    twidth--;
   1599 	    p++;
   1600 	    q++;
   1601 	}
   1602 
   1603 	dst_line += dst_stride;
   1604 	mask_line += mask_stride;
   1605     }
   1606 
   1607     _mm_empty ();
   1608 }
   1609 
   1610 static void
   1611 mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
   1612                                 pixman_composite_info_t *info)
   1613 {
   1614     PIXMAN_COMPOSITE_ARGS (info);
   1615     uint32_t    *dst_line, *dst;
   1616     uint32_t    *src_line, *src;
   1617     uint32_t mask;
   1618     __m64 vmask;
   1619     int dst_stride, src_stride;
   1620     int32_t w;
   1621 
   1622     CHECKPOINT ();
   1623 
   1624     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   1625     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   1626 
   1627     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
   1628     vmask = expand_alpha (load8888 (&mask));
   1629 
   1630     while (height--)
   1631     {
   1632 	dst = dst_line;
   1633 	dst_line += dst_stride;
   1634 	src = src_line;
   1635 	src_line += src_stride;
   1636 	w = width;
   1637 
   1638 	while (w && (uintptr_t)dst & 7)
   1639 	{
   1640 	    __m64 s = load8888 (src);
   1641 	    __m64 d = load8888 (dst);
   1642 
   1643 	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
   1644 
   1645 	    w--;
   1646 	    dst++;
   1647 	    src++;
   1648 	}
   1649 
   1650 	while (w >= 2)
   1651 	{
   1652 	    __m64 vs = ldq_u ((__m64 *)src);
   1653 	    __m64 vd = *(__m64 *)dst;
   1654 	    __m64 vsrc0 = expand8888 (vs, 0);
   1655 	    __m64 vsrc1 = expand8888 (vs, 1);
   1656 
   1657 	    *(__m64 *)dst = pack8888 (
   1658 	        in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
   1659 	        in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
   1660 
   1661 	    w -= 2;
   1662 	    dst += 2;
   1663 	    src += 2;
   1664 	}
   1665 
   1666 	if (w)
   1667 	{
   1668 	    __m64 s = load8888 (src);
   1669 	    __m64 d = load8888 (dst);
   1670 
   1671 	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
   1672 	}
   1673     }
   1674 
   1675     _mm_empty ();
   1676 }
   1677 
   1678 static void
   1679 mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
   1680                                 pixman_composite_info_t *info)
   1681 {
   1682     PIXMAN_COMPOSITE_ARGS (info);
   1683     uint32_t *dst_line, *dst;
   1684     uint32_t *src_line, *src;
   1685     uint32_t mask;
   1686     __m64 vmask;
   1687     int dst_stride, src_stride;
   1688     int32_t w;
   1689     __m64 srca;
   1690 
   1691     CHECKPOINT ();
   1692 
   1693     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   1694     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   1695     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
   1696 
   1697     vmask = expand_alpha (load8888 (&mask));
   1698     srca = MC (4x00ff);
   1699 
   1700     while (height--)
   1701     {
   1702 	dst = dst_line;
   1703 	dst_line += dst_stride;
   1704 	src = src_line;
   1705 	src_line += src_stride;
   1706 	w = width;
   1707 
   1708 	while (w && (uintptr_t)dst & 7)
   1709 	{
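	    /* x8r8g8b8 leaves the alpha byte undefined, so force it to
	     * fully opaque before compositing.
	     */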
   1710 	    uint32_t ssrc = *src | 0xff000000;
   1711 	    __m64 s = load8888 (&ssrc);
   1712 	    __m64 d = load8888 (dst);
   1713 
   1714 	    store8888 (dst, in_over (s, srca, vmask, d));
   1715 
   1716 	    w--;
   1717 	    dst++;
   1718 	    src++;
   1719 	}
   1720 
   1721 	while (w >= 16)
   1722 	{
   1723 	    __m64 vd0 = *(__m64 *)(dst + 0);
   1724 	    __m64 vd1 = *(__m64 *)(dst + 2);
   1725 	    __m64 vd2 = *(__m64 *)(dst + 4);
   1726 	    __m64 vd3 = *(__m64 *)(dst + 6);
   1727 	    __m64 vd4 = *(__m64 *)(dst + 8);
   1728 	    __m64 vd5 = *(__m64 *)(dst + 10);
   1729 	    __m64 vd6 = *(__m64 *)(dst + 12);
   1730 	    __m64 vd7 = *(__m64 *)(dst + 14);
   1731 
   1732 	    __m64 vs0 = ldq_u ((__m64 *)(src + 0));
   1733 	    __m64 vs1 = ldq_u ((__m64 *)(src + 2));
   1734 	    __m64 vs2 = ldq_u ((__m64 *)(src + 4));
   1735 	    __m64 vs3 = ldq_u ((__m64 *)(src + 6));
   1736 	    __m64 vs4 = ldq_u ((__m64 *)(src + 8));
   1737 	    __m64 vs5 = ldq_u ((__m64 *)(src + 10));
   1738 	    __m64 vs6 = ldq_u ((__m64 *)(src + 12));
   1739 	    __m64 vs7 = ldq_u ((__m64 *)(src + 14));
   1740 
   1741 	    vd0 = pack8888 (
   1742 	        in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
   1743 	        in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
   1744 
   1745 	    vd1 = pack8888 (
   1746 	        in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
   1747 	        in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
   1748 
   1749 	    vd2 = pack8888 (
   1750 	        in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
   1751 	        in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
   1752 
   1753 	    vd3 = pack8888 (
   1754 	        in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
   1755 	        in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
   1756 
   1757 	    vd4 = pack8888 (
   1758 	        in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
   1759 	        in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
   1760 
   1761 	    vd5 = pack8888 (
   1762 	        in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
   1763 	        in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
   1764 
   1765 	    vd6 = pack8888 (
   1766 	        in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
   1767 	        in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
   1768 
   1769 	    vd7 = pack8888 (
   1770 	        in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
   1771 	        in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
   1772 
   1773 	    *(__m64 *)(dst + 0) = vd0;
   1774 	    *(__m64 *)(dst + 2) = vd1;
   1775 	    *(__m64 *)(dst + 4) = vd2;
   1776 	    *(__m64 *)(dst + 6) = vd3;
   1777 	    *(__m64 *)(dst + 8) = vd4;
   1778 	    *(__m64 *)(dst + 10) = vd5;
   1779 	    *(__m64 *)(dst + 12) = vd6;
   1780 	    *(__m64 *)(dst + 14) = vd7;
   1781 
   1782 	    w -= 16;
   1783 	    dst += 16;
   1784 	    src += 16;
   1785 	}
   1786 
   1787 	while (w)
   1788 	{
   1789 	    uint32_t ssrc = *src | 0xff000000;
   1790 	    __m64 s = load8888 (&ssrc);
   1791 	    __m64 d = load8888 (dst);
   1792 
   1793 	    store8888 (dst, in_over (s, srca, vmask, d));
   1794 
   1795 	    w--;
   1796 	    dst++;
   1797 	    src++;
   1798 	}
   1799     }
   1800 
   1801     _mm_empty ();
   1802 }
   1803 
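         /* OVER for premultiplied a8r8g8b8 source onto a8r8g8b8 destination:
          * fully opaque source pixels are copied, fully transparent ones are
          * skipped, everything else is blended one pixel at a time. */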
   1804 static void
   1805 mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
   1806                               pixman_composite_info_t *info)
   1807 {
   1808     PIXMAN_COMPOSITE_ARGS (info);
   1809     uint32_t *dst_line, *dst;
   1810     uint32_t *src_line, *src;
   1811     uint32_t s;
   1812     int dst_stride, src_stride;
   1813     uint8_t a;
   1814     int32_t w;
   1815 
   1816     CHECKPOINT ();
   1817 
   1818     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   1819     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   1820 
   1821     while (height--)
   1822     {
   1823 	dst = dst_line;
   1824 	dst_line += dst_stride;
   1825 	src = src_line;
   1826 	src_line += src_stride;
   1827 	w = width;
   1828 
   1829 	while (w--)
   1830 	{
   1831 	    s = *src++;
   1832 	    a = s >> 24;
   1833 
   1834 	    if (a == 0xff)
   1835 	    {
   1836 		*dst = s;
   1837 	    }
   1838 	    else if (s)
   1839 	    {
   1840 		__m64 ms, sa;
   1841 		ms = load8888 (&s);
   1842 		sa = expand_alpha (ms);
   1843 		store8888 (dst, over (ms, sa, load8888 (dst)));
   1844 	    }
   1845 
   1846 	    dst++;
   1847 	}
   1848     }
   1849     _mm_empty ();
   1850 }
   1851 
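         /* OVER of a8r8g8b8 onto an r5g6b5 destination.  Leading/trailing
          * pixels are handled one at a time until the destination is 8-byte
          * aligned; the main loop blends four pixels per iteration using
          * expand_4x565()/pack_4x565(). */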
   1852 static void
   1853 mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
   1854                               pixman_composite_info_t *info)
   1855 {
   1856     PIXMAN_COMPOSITE_ARGS (info);
   1857     uint16_t    *dst_line, *dst;
   1858     uint32_t    *src_line, *src;
   1859     int dst_stride, src_stride;
   1860     int32_t w;
   1861 
   1862     CHECKPOINT ();
   1863 
   1864     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   1865     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   1866 
   1867 #if 0
   1868     /* FIXME */
   1869     assert (src_image->drawable == mask_image->drawable);
   1870 #endif
   1871 
   1872     while (height--)
   1873     {
   1874 	dst = dst_line;
   1875 	dst_line += dst_stride;
   1876 	src = src_line;
   1877 	src_line += src_stride;
   1878 	w = width;
   1879 
   1880 	CHECKPOINT ();
   1881 
   1882 	while (w && (uintptr_t)dst & 7)
   1883 	{
   1884 	    __m64 vsrc = load8888 (src);
   1885 	    uint64_t d = *dst;
   1886 	    __m64 vdest = expand565 (to_m64 (d), 0);
   1887 
   1888 	    vdest = pack_565 (
   1889 		over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
   1890 
   1891 	    *dst = to_uint64 (vdest);
   1892 
   1893 	    w--;
   1894 	    dst++;
   1895 	    src++;
   1896 	}
   1897 
   1898 	CHECKPOINT ();
   1899 
   1900 	while (w >= 4)
   1901 	{
   1902 	    __m64 vdest = *(__m64 *)dst;
   1903 	    __m64 v0, v1, v2, v3;
   1904 	    __m64 vsrc0, vsrc1, vsrc2, vsrc3;
   1905 
   1906 	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
   1907 
   1908 	    vsrc0 = load8888 ((src + 0));
   1909 	    vsrc1 = load8888 ((src + 1));
   1910 	    vsrc2 = load8888 ((src + 2));
   1911 	    vsrc3 = load8888 ((src + 3));
   1912 
   1913 	    v0 = over (vsrc0, expand_alpha (vsrc0), v0);
   1914 	    v1 = over (vsrc1, expand_alpha (vsrc1), v1);
   1915 	    v2 = over (vsrc2, expand_alpha (vsrc2), v2);
   1916 	    v3 = over (vsrc3, expand_alpha (vsrc3), v3);
   1917 
   1918 	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
   1919 
   1920 	    w -= 4;
   1921 	    dst += 4;
   1922 	    src += 4;
   1923 	}
   1924 
   1925 	CHECKPOINT ();
   1926 
   1927 	while (w)
   1928 	{
   1929 	    __m64 vsrc = load8888 (src);
   1930 	    uint64_t d = *dst;
   1931 	    __m64 vdest = expand565 (to_m64 (d), 0);
   1932 
   1933 	    vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
   1934 
   1935 	    *dst = to_uint64 (vdest);
   1936 
   1937 	    w--;
   1938 	    dst++;
   1939 	    src++;
   1940 	}
   1941     }
   1942 
   1943     _mm_empty ();
   1944 }
   1945 
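         /* Solid source OVER an a8r8g8b8 destination through an a8 mask,
          * two pixels per iteration; a pair of 0xff mask bytes with an
          * opaque source is stored directly as the prebuilt 64-bit value. */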
   1946 static void
   1947 mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
   1948                              pixman_composite_info_t *info)
   1949 {
   1950     PIXMAN_COMPOSITE_ARGS (info);
   1951     uint32_t src, srca;
   1952     uint32_t *dst_line, *dst;
   1953     uint8_t *mask_line, *mask;
   1954     int dst_stride, mask_stride;
   1955     int32_t w;
   1956     __m64 vsrc, vsrca;
   1957     uint64_t srcsrc;
   1958 
   1959     CHECKPOINT ();
   1960 
   1961     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   1962 
   1963     srca = src >> 24;
   1964     if (src == 0)
   1965 	return;
   1966 
   1967     srcsrc = (uint64_t)src << 32 | src;
   1968 
   1969     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   1970     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   1971 
   1972     vsrc = load8888 (&src);
   1973     vsrca = expand_alpha (vsrc);
   1974 
   1975     while (height--)
   1976     {
   1977 	dst = dst_line;
   1978 	dst_line += dst_stride;
   1979 	mask = mask_line;
   1980 	mask_line += mask_stride;
   1981 	w = width;
   1982 
   1983 	CHECKPOINT ();
   1984 
   1985 	while (w && (uintptr_t)dst & 7)
   1986 	{
   1987 	    uint64_t m = *mask;
   1988 
   1989 	    if (m)
   1990 	    {
   1991 		__m64 vdest = in_over (vsrc, vsrca,
   1992 				       expand_alpha_rev (to_m64 (m)),
   1993 				       load8888 (dst));
   1994 
   1995 		store8888 (dst, vdest);
   1996 	    }
   1997 
   1998 	    w--;
   1999 	    mask++;
   2000 	    dst++;
   2001 	}
   2002 
   2003 	CHECKPOINT ();
   2004 
   2005 	while (w >= 2)
   2006 	{
   2007 	    uint64_t m0, m1;
   2008 
   2009 	    m0 = *mask;
   2010 	    m1 = *(mask + 1);
   2011 
   2012 	    if (srca == 0xff && (m0 & m1) == 0xff)
   2013 	    {
   2014 		*(uint64_t *)dst = srcsrc;
   2015 	    }
   2016 	    else if (m0 | m1)
   2017 	    {
   2018 		__m64 vdest;
   2019 		__m64 dest0, dest1;
   2020 
   2021 		vdest = *(__m64 *)dst;
   2022 
   2023 		dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
   2024 				 expand8888 (vdest, 0));
   2025 		dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
   2026 				 expand8888 (vdest, 1));
   2027 
   2028 		*(__m64 *)dst = pack8888 (dest0, dest1);
   2029 	    }
   2030 
   2031 	    mask += 2;
   2032 	    dst += 2;
   2033 	    w -= 2;
   2034 	}
   2035 
   2036 	CHECKPOINT ();
   2037 
   2038 	if (w)
   2039 	{
   2040 	    uint64_t m = *mask;
   2041 
   2042 	    if (m)
   2043 	    {
   2044 		__m64 vdest = load8888 (dst);
   2045 
   2046 		vdest = in_over (
   2047 		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
   2048 		store8888 (dst, vdest);
   2049 	    }
   2050 	}
   2051     }
   2052 
   2053     _mm_empty ();
   2054 }
   2055 
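         /* Solid fill for 8, 16 or 32 bpp buffers.  The filler is replicated
          * to 32 bits, the destination is aligned step by step, and the bulk
          * is written 64 bytes per iteration (eight MMX registers via inline
          * assembly on GNU x86, plain __m64 stores otherwise). */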
   2056 static pixman_bool_t
   2057 mmx_fill (pixman_implementation_t *imp,
   2058           uint32_t *               bits,
   2059           int                      stride,
   2060           int                      bpp,
   2061           int                      x,
   2062           int                      y,
   2063           int                      width,
   2064           int                      height,
   2065           uint32_t		   filler)
   2066 {
   2067     uint64_t fill;
   2068     __m64 vfill;
   2069     uint32_t byte_width;
   2070     uint8_t     *byte_line;
   2071 
   2072 #if defined __GNUC__ && defined USE_X86_MMX
   2073     __m64 v1, v2, v3, v4, v5, v6, v7;
   2074 #endif
   2075 
   2076     if (bpp != 16 && bpp != 32 && bpp != 8)
   2077 	return FALSE;
   2078 
   2079     if (bpp == 8)
   2080     {
   2081 	stride = stride * (int) sizeof (uint32_t) / 1;
   2082 	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
   2083 	byte_width = width;
   2084 	stride *= 1;
   2085         filler = (filler & 0xff) * 0x01010101;
   2086     }
   2087     else if (bpp == 16)
   2088     {
   2089 	stride = stride * (int) sizeof (uint32_t) / 2;
   2090 	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
   2091 	byte_width = 2 * width;
   2092 	stride *= 2;
   2093         filler = (filler & 0xffff) * 0x00010001;
   2094     }
   2095     else
   2096     {
   2097 	stride = stride * (int) sizeof (uint32_t) / 4;
   2098 	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
   2099 	byte_width = 4 * width;
   2100 	stride *= 4;
   2101     }
   2102 
   2103     fill = ((uint64_t)filler << 32) | filler;
   2104     vfill = to_m64 (fill);
   2105 
   2106 #if defined __GNUC__ && defined USE_X86_MMX
   2107     __asm__ (
   2108         "movq		%7,	%0\n"
   2109         "movq		%7,	%1\n"
   2110         "movq		%7,	%2\n"
   2111         "movq		%7,	%3\n"
   2112         "movq		%7,	%4\n"
   2113         "movq		%7,	%5\n"
   2114         "movq		%7,	%6\n"
   2115 	: "=&y" (v1), "=&y" (v2), "=&y" (v3),
   2116 	  "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
   2117 	: "y" (vfill));
   2118 #endif
   2119 
   2120     while (height--)
   2121     {
   2122 	int w;
   2123 	uint8_t *d = byte_line;
   2124 
   2125 	byte_line += stride;
   2126 	w = byte_width;
   2127 
   2128 	if (w >= 1 && ((uintptr_t)d & 1))
   2129 	{
   2130 	    *(uint8_t *)d = (filler & 0xff);
   2131 	    w--;
   2132 	    d++;
   2133 	}
   2134 
   2135 	if (w >= 2 && ((uintptr_t)d & 3))
   2136 	{
   2137 	    *(uint16_t *)d = filler;
   2138 	    w -= 2;
   2139 	    d += 2;
   2140 	}
   2141 
   2142 	while (w >= 4 && ((uintptr_t)d & 7))
   2143 	{
   2144 	    *(uint32_t *)d = filler;
   2145 
   2146 	    w -= 4;
   2147 	    d += 4;
   2148 	}
   2149 
   2150 	while (w >= 64)
   2151 	{
   2152 #if defined __GNUC__ && defined USE_X86_MMX
   2153 	    __asm__ (
   2154 	        "movq	%1,	  (%0)\n"
   2155 	        "movq	%2,	 8(%0)\n"
   2156 	        "movq	%3,	16(%0)\n"
   2157 	        "movq	%4,	24(%0)\n"
   2158 	        "movq	%5,	32(%0)\n"
   2159 	        "movq	%6,	40(%0)\n"
   2160 	        "movq	%7,	48(%0)\n"
   2161 	        "movq	%8,	56(%0)\n"
   2162 		:
   2163 		: "r" (d),
   2164 		  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
   2165 		  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
   2166 		: "memory");
   2167 #else
   2168 	    *(__m64*) (d +  0) = vfill;
   2169 	    *(__m64*) (d +  8) = vfill;
   2170 	    *(__m64*) (d + 16) = vfill;
   2171 	    *(__m64*) (d + 24) = vfill;
   2172 	    *(__m64*) (d + 32) = vfill;
   2173 	    *(__m64*) (d + 40) = vfill;
   2174 	    *(__m64*) (d + 48) = vfill;
   2175 	    *(__m64*) (d + 56) = vfill;
   2176 #endif
   2177 	    w -= 64;
   2178 	    d += 64;
   2179 	}
   2180 
   2181 	while (w >= 4)
   2182 	{
   2183 	    *(uint32_t *)d = filler;
   2184 
   2185 	    w -= 4;
   2186 	    d += 4;
   2187 	}
   2188 	if (w >= 2)
   2189 	{
   2190 	    *(uint16_t *)d = filler;
   2191 	    w -= 2;
   2192 	    d += 2;
   2193 	}
   2194 	if (w >= 1)
   2195 	{
   2196 	    *(uint8_t *)d = (filler & 0xff);
   2197 	    w--;
   2198 	    d++;
   2199 	}
   2200 
   2201     }
   2202 
   2203     _mm_empty ();
   2204     return TRUE;
   2205 }
   2206 
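         /* SRC conversion from x8r8g8b8 to r5g6b5, four pixels per
          * iteration with pack_4xpacked565(). */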
   2207 static void
   2208 mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
   2209                              pixman_composite_info_t *info)
   2210 {
   2211     PIXMAN_COMPOSITE_ARGS (info);
   2212     uint16_t    *dst_line, *dst;
   2213     uint32_t    *src_line, *src, s;
   2214     int dst_stride, src_stride;
   2215     int32_t w;
   2216 
   2217     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   2218     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   2219 
   2220     while (height--)
   2221     {
   2222 	dst = dst_line;
   2223 	dst_line += dst_stride;
   2224 	src = src_line;
   2225 	src_line += src_stride;
   2226 	w = width;
   2227 
   2228 	while (w && (uintptr_t)dst & 7)
   2229 	{
   2230 	    s = *src++;
   2231 	    *dst = convert_8888_to_0565 (s);
   2232 	    dst++;
   2233 	    w--;
   2234 	}
   2235 
   2236 	while (w >= 4)
   2237 	{
   2238 	    __m64 vdest;
   2239 	    __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
   2240 	    __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
   2241 
   2242 	    vdest = pack_4xpacked565 (vsrc0, vsrc1);
   2243 
   2244 	    *(__m64 *)dst = vdest;
   2245 
   2246 	    w -= 4;
   2247 	    src += 4;
   2248 	    dst += 4;
   2249 	}
   2250 
   2251 	while (w)
   2252 	{
   2253 	    s = *src++;
   2254 	    *dst = convert_8888_to_0565 (s);
   2255 	    dst++;
   2256 	    w--;
   2257 	}
   2258     }
   2259 
   2260     _mm_empty ();
   2261 }
   2262 
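         /* SRC of a solid color through an a8 mask into a8r8g8b8: zero mask
          * bytes clear the destination, and a fully transparent source
          * degenerates into an mmx_fill() with zero. */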
   2263 static void
   2264 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
   2265                             pixman_composite_info_t *info)
   2266 {
   2267     PIXMAN_COMPOSITE_ARGS (info);
   2268     uint32_t src, srca;
   2269     uint32_t    *dst_line, *dst;
   2270     uint8_t     *mask_line, *mask;
   2271     int dst_stride, mask_stride;
   2272     int32_t w;
   2273     __m64 vsrc;
   2274     uint64_t srcsrc;
   2275 
   2276     CHECKPOINT ();
   2277 
   2278     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2279 
   2280     srca = src >> 24;
   2281     if (src == 0)
   2282     {
   2283 	mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
   2284 		  PIXMAN_FORMAT_BPP (dest_image->bits.format),
   2285 		  dest_x, dest_y, width, height, 0);
   2286 	return;
   2287     }
   2288 
   2289     srcsrc = (uint64_t)src << 32 | src;
   2290 
   2291     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   2292     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   2293 
   2294     vsrc = load8888 (&src);
   2295 
   2296     while (height--)
   2297     {
   2298 	dst = dst_line;
   2299 	dst_line += dst_stride;
   2300 	mask = mask_line;
   2301 	mask_line += mask_stride;
   2302 	w = width;
   2303 
   2304 	CHECKPOINT ();
   2305 
   2306 	while (w && (uintptr_t)dst & 7)
   2307 	{
   2308 	    uint64_t m = *mask;
   2309 
   2310 	    if (m)
   2311 	    {
   2312 		__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
   2313 
   2314 		store8888 (dst, vdest);
   2315 	    }
   2316 	    else
   2317 	    {
   2318 		*dst = 0;
   2319 	    }
   2320 
   2321 	    w--;
   2322 	    mask++;
   2323 	    dst++;
   2324 	}
   2325 
   2326 	CHECKPOINT ();
   2327 
   2328 	while (w >= 2)
   2329 	{
   2330 	    uint64_t m0, m1;
   2331 	    m0 = *mask;
   2332 	    m1 = *(mask + 1);
   2333 
   2334 	    if (srca == 0xff && (m0 & m1) == 0xff)
   2335 	    {
   2336 		*(uint64_t *)dst = srcsrc;
   2337 	    }
   2338 	    else if (m0 | m1)
   2339 	    {
   2340 		__m64 dest0, dest1;
   2341 
   2342 		dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
   2343 		dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
   2344 
   2345 		*(__m64 *)dst = pack8888 (dest0, dest1);
   2346 	    }
   2347 	    else
   2348 	    {
   2349 		*(uint64_t *)dst = 0;
   2350 	    }
   2351 
   2352 	    mask += 2;
   2353 	    dst += 2;
   2354 	    w -= 2;
   2355 	}
   2356 
   2357 	CHECKPOINT ();
   2358 
   2359 	if (w)
   2360 	{
   2361 	    uint64_t m = *mask;
   2362 
   2363 	    if (m)
   2364 	    {
   2365 		__m64 vdest = load8888 (dst);
   2366 
   2367 		vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
   2368 		store8888 (dst, vdest);
   2369 	    }
   2370 	    else
   2371 	    {
   2372 		*dst = 0;
   2373 	    }
   2374 	}
   2375     }
   2376 
   2377     _mm_empty ();
   2378 }
   2379 
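         /* Solid source OVER an r5g6b5 destination through an a8 mask, four
          * pixels per iteration; four 0xff mask bytes with an opaque source
          * store a prebuilt quad of the 565-packed color. */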
   2380 static void
   2381 mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
   2382                              pixman_composite_info_t *info)
   2383 {
   2384     PIXMAN_COMPOSITE_ARGS (info);
   2385     uint32_t src, srca;
   2386     uint16_t *dst_line, *dst;
   2387     uint8_t *mask_line, *mask;
   2388     int dst_stride, mask_stride;
   2389     int32_t w;
   2390     __m64 vsrc, vsrca, tmp;
   2391     __m64 srcsrcsrcsrc;
   2392 
   2393     CHECKPOINT ();
   2394 
   2395     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2396 
   2397     srca = src >> 24;
   2398     if (src == 0)
   2399 	return;
   2400 
   2401     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   2402     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   2403 
   2404     vsrc = load8888 (&src);
   2405     vsrca = expand_alpha (vsrc);
   2406 
   2407     tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
   2408     srcsrcsrcsrc = expand_alpha_rev (tmp);
   2409 
   2410     while (height--)
   2411     {
   2412 	dst = dst_line;
   2413 	dst_line += dst_stride;
   2414 	mask = mask_line;
   2415 	mask_line += mask_stride;
   2416 	w = width;
   2417 
   2418 	CHECKPOINT ();
   2419 
   2420 	while (w && (uintptr_t)dst & 7)
   2421 	{
   2422 	    uint64_t m = *mask;
   2423 
   2424 	    if (m)
   2425 	    {
   2426 		uint64_t d = *dst;
   2427 		__m64 vd = to_m64 (d);
   2428 		__m64 vdest = in_over (
   2429 		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
   2430 
   2431 		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
   2432 		*dst = to_uint64 (vd);
   2433 	    }
   2434 
   2435 	    w--;
   2436 	    mask++;
   2437 	    dst++;
   2438 	}
   2439 
   2440 	CHECKPOINT ();
   2441 
   2442 	while (w >= 4)
   2443 	{
   2444 	    uint64_t m0, m1, m2, m3;
   2445 	    m0 = *mask;
   2446 	    m1 = *(mask + 1);
   2447 	    m2 = *(mask + 2);
   2448 	    m3 = *(mask + 3);
   2449 
   2450 	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
   2451 	    {
   2452 		*(__m64 *)dst = srcsrcsrcsrc;
   2453 	    }
   2454 	    else if (m0 | m1 | m2 | m3)
   2455 	    {
   2456 		__m64 vdest = *(__m64 *)dst;
   2457 		__m64 v0, v1, v2, v3;
   2458 		__m64 vm0, vm1, vm2, vm3;
   2459 
   2460 		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
   2461 
   2462 		vm0 = to_m64 (m0);
   2463 		v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
   2464 
   2465 		vm1 = to_m64 (m1);
   2466 		v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
   2467 
   2468 		vm2 = to_m64 (m2);
   2469 		v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
   2470 
   2471 		vm3 = to_m64 (m3);
   2472 		v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
   2473 
    2474 		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
   2475 	    }
   2476 
   2477 	    w -= 4;
   2478 	    mask += 4;
   2479 	    dst += 4;
   2480 	}
   2481 
   2482 	CHECKPOINT ();
   2483 
   2484 	while (w)
   2485 	{
   2486 	    uint64_t m = *mask;
   2487 
   2488 	    if (m)
   2489 	    {
   2490 		uint64_t d = *dst;
   2491 		__m64 vd = to_m64 (d);
   2492 		__m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
   2493 				       expand565 (vd, 0));
   2494 		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
   2495 		*dst = to_uint64 (vd);
   2496 	    }
   2497 
   2498 	    w--;
   2499 	    mask++;
   2500 	    dst++;
   2501 	}
   2502     }
   2503 
   2504     _mm_empty ();
   2505 }
   2506 
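         /* OVER of a non-premultiplied pixbuf source onto r5g6b5 using
          * over_rev_non_pre(); fully opaque quads are converted with
          * invert_colors() and packed straight to 565. */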
   2507 static void
   2508 mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
   2509                                 pixman_composite_info_t *info)
   2510 {
   2511     PIXMAN_COMPOSITE_ARGS (info);
   2512     uint16_t    *dst_line, *dst;
   2513     uint32_t    *src_line, *src;
   2514     int dst_stride, src_stride;
   2515     int32_t w;
   2516 
   2517     CHECKPOINT ();
   2518 
   2519     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   2520     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   2521 
   2522 #if 0
   2523     /* FIXME */
   2524     assert (src_image->drawable == mask_image->drawable);
   2525 #endif
   2526 
   2527     while (height--)
   2528     {
   2529 	dst = dst_line;
   2530 	dst_line += dst_stride;
   2531 	src = src_line;
   2532 	src_line += src_stride;
   2533 	w = width;
   2534 
   2535 	CHECKPOINT ();
   2536 
   2537 	while (w && (uintptr_t)dst & 7)
   2538 	{
   2539 	    __m64 vsrc = load8888 (src);
   2540 	    uint64_t d = *dst;
   2541 	    __m64 vdest = expand565 (to_m64 (d), 0);
   2542 
   2543 	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
   2544 
   2545 	    *dst = to_uint64 (vdest);
   2546 
   2547 	    w--;
   2548 	    dst++;
   2549 	    src++;
   2550 	}
   2551 
   2552 	CHECKPOINT ();
   2553 
   2554 	while (w >= 4)
   2555 	{
   2556 	    uint32_t s0, s1, s2, s3;
   2557 	    unsigned char a0, a1, a2, a3;
   2558 
   2559 	    s0 = *src;
   2560 	    s1 = *(src + 1);
   2561 	    s2 = *(src + 2);
   2562 	    s3 = *(src + 3);
   2563 
   2564 	    a0 = (s0 >> 24);
   2565 	    a1 = (s1 >> 24);
   2566 	    a2 = (s2 >> 24);
   2567 	    a3 = (s3 >> 24);
   2568 
   2569 	    if ((a0 & a1 & a2 & a3) == 0xFF)
   2570 	    {
   2571 		__m64 v0 = invert_colors (load8888 (&s0));
   2572 		__m64 v1 = invert_colors (load8888 (&s1));
   2573 		__m64 v2 = invert_colors (load8888 (&s2));
   2574 		__m64 v3 = invert_colors (load8888 (&s3));
   2575 
   2576 		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
   2577 	    }
   2578 	    else if (s0 | s1 | s2 | s3)
   2579 	    {
   2580 		__m64 vdest = *(__m64 *)dst;
   2581 		__m64 v0, v1, v2, v3;
   2582 
   2583 		__m64 vsrc0 = load8888 (&s0);
   2584 		__m64 vsrc1 = load8888 (&s1);
   2585 		__m64 vsrc2 = load8888 (&s2);
   2586 		__m64 vsrc3 = load8888 (&s3);
   2587 
   2588 		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
   2589 
   2590 		v0 = over_rev_non_pre (vsrc0, v0);
   2591 		v1 = over_rev_non_pre (vsrc1, v1);
   2592 		v2 = over_rev_non_pre (vsrc2, v2);
   2593 		v3 = over_rev_non_pre (vsrc3, v3);
   2594 
   2595 		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
   2596 	    }
   2597 
   2598 	    w -= 4;
   2599 	    dst += 4;
   2600 	    src += 4;
   2601 	}
   2602 
   2603 	CHECKPOINT ();
   2604 
   2605 	while (w)
   2606 	{
   2607 	    __m64 vsrc = load8888 (src);
   2608 	    uint64_t d = *dst;
   2609 	    __m64 vdest = expand565 (to_m64 (d), 0);
   2610 
   2611 	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
   2612 
   2613 	    *dst = to_uint64 (vdest);
   2614 
   2615 	    w--;
   2616 	    dst++;
   2617 	    src++;
   2618 	}
   2619     }
   2620 
   2621     _mm_empty ();
   2622 }
   2623 
   2624 static void
   2625 mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
   2626                                 pixman_composite_info_t *info)
   2627 {
   2628     PIXMAN_COMPOSITE_ARGS (info);
   2629     uint32_t    *dst_line, *dst;
   2630     uint32_t    *src_line, *src;
   2631     int dst_stride, src_stride;
   2632     int32_t w;
   2633 
   2634     CHECKPOINT ();
   2635 
   2636     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   2637     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   2638 
   2639 #if 0
   2640     /* FIXME */
   2641     assert (src_image->drawable == mask_image->drawable);
   2642 #endif
   2643 
   2644     while (height--)
   2645     {
   2646 	dst = dst_line;
   2647 	dst_line += dst_stride;
   2648 	src = src_line;
   2649 	src_line += src_stride;
   2650 	w = width;
   2651 
   2652 	while (w && (uintptr_t)dst & 7)
   2653 	{
   2654 	    __m64 s = load8888 (src);
   2655 	    __m64 d = load8888 (dst);
   2656 
   2657 	    store8888 (dst, over_rev_non_pre (s, d));
   2658 
   2659 	    w--;
   2660 	    dst++;
   2661 	    src++;
   2662 	}
   2663 
   2664 	while (w >= 2)
   2665 	{
   2666 	    uint32_t s0, s1;
   2667 	    unsigned char a0, a1;
   2668 	    __m64 d0, d1;
   2669 
   2670 	    s0 = *src;
   2671 	    s1 = *(src + 1);
   2672 
   2673 	    a0 = (s0 >> 24);
   2674 	    a1 = (s1 >> 24);
   2675 
   2676 	    if ((a0 & a1) == 0xFF)
   2677 	    {
   2678 		d0 = invert_colors (load8888 (&s0));
   2679 		d1 = invert_colors (load8888 (&s1));
   2680 
   2681 		*(__m64 *)dst = pack8888 (d0, d1);
   2682 	    }
   2683 	    else if (s0 | s1)
   2684 	    {
   2685 		__m64 vdest = *(__m64 *)dst;
   2686 
   2687 		d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
   2688 		d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
   2689 
   2690 		*(__m64 *)dst = pack8888 (d0, d1);
   2691 	    }
   2692 
   2693 	    w -= 2;
   2694 	    dst += 2;
   2695 	    src += 2;
   2696 	}
   2697 
   2698 	if (w)
   2699 	{
   2700 	    __m64 s = load8888 (src);
   2701 	    __m64 d = load8888 (dst);
   2702 
   2703 	    store8888 (dst, over_rev_non_pre (s, d));
   2704 	}
   2705     }
   2706 
   2707     _mm_empty ();
   2708 }
   2709 
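         /* Solid source OVER an r5g6b5 destination with a component-alpha
          * a8r8g8b8 mask: each mask pixel supplies per-channel alpha to
          * in_over(). */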
   2710 static void
   2711 mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
   2712                                    pixman_composite_info_t *info)
   2713 {
   2714     PIXMAN_COMPOSITE_ARGS (info);
   2715     uint32_t src;
   2716     uint16_t    *dst_line;
   2717     uint32_t    *mask_line;
   2718     int dst_stride, mask_stride;
   2719     __m64 vsrc, vsrca;
   2720 
   2721     CHECKPOINT ();
   2722 
   2723     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2724 
   2725     if (src == 0)
   2726 	return;
   2727 
   2728     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   2729     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
   2730 
   2731     vsrc = load8888 (&src);
   2732     vsrca = expand_alpha (vsrc);
   2733 
   2734     while (height--)
   2735     {
   2736 	int twidth = width;
   2737 	uint32_t *p = (uint32_t *)mask_line;
   2738 	uint16_t *q = (uint16_t *)dst_line;
   2739 
   2740 	while (twidth && ((uintptr_t)q & 7))
   2741 	{
   2742 	    uint32_t m = *(uint32_t *)p;
   2743 
   2744 	    if (m)
   2745 	    {
   2746 		uint64_t d = *q;
   2747 		__m64 vdest = expand565 (to_m64 (d), 0);
   2748 		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
   2749 		*q = to_uint64 (vdest);
   2750 	    }
   2751 
   2752 	    twidth--;
   2753 	    p++;
   2754 	    q++;
   2755 	}
   2756 
   2757 	while (twidth >= 4)
   2758 	{
   2759 	    uint32_t m0, m1, m2, m3;
   2760 
   2761 	    m0 = *p;
   2762 	    m1 = *(p + 1);
   2763 	    m2 = *(p + 2);
   2764 	    m3 = *(p + 3);
   2765 
   2766 	    if ((m0 | m1 | m2 | m3))
   2767 	    {
   2768 		__m64 vdest = *(__m64 *)q;
   2769 		__m64 v0, v1, v2, v3;
   2770 
   2771 		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
   2772 
   2773 		v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
   2774 		v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
   2775 		v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
   2776 		v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
   2777 
   2778 		*(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
   2779 	    }
   2780 	    twidth -= 4;
   2781 	    p += 4;
   2782 	    q += 4;
   2783 	}
   2784 
   2785 	while (twidth)
   2786 	{
   2787 	    uint32_t m;
   2788 
   2789 	    m = *(uint32_t *)p;
   2790 	    if (m)
   2791 	    {
   2792 		uint64_t d = *q;
   2793 		__m64 vdest = expand565 (to_m64 (d), 0);
   2794 		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
   2795 		*q = to_uint64 (vdest);
   2796 	    }
   2797 
   2798 	    twidth--;
   2799 	    p++;
   2800 	    q++;
   2801 	}
   2802 
   2803 	mask_line += mask_stride;
   2804 	dst_line += dst_stride;
   2805     }
   2806 
   2807     _mm_empty ();
   2808 }
   2809 
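         /* IN of a solid color with an a8 mask into an a8 destination:
          * dest = dest * (src_alpha * mask), four bytes per iteration in
          * the aligned loop. */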
   2810 static void
   2811 mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
   2812                         pixman_composite_info_t *info)
   2813 {
   2814     PIXMAN_COMPOSITE_ARGS (info);
   2815     uint8_t *dst_line, *dst;
   2816     uint8_t *mask_line, *mask;
   2817     int dst_stride, mask_stride;
   2818     int32_t w;
   2819     uint32_t src;
   2820     uint8_t sa;
   2821     __m64 vsrc, vsrca;
   2822 
   2823     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
   2824     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   2825 
   2826     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2827 
   2828     sa = src >> 24;
   2829 
   2830     vsrc = load8888 (&src);
   2831     vsrca = expand_alpha (vsrc);
   2832 
   2833     while (height--)
   2834     {
   2835 	dst = dst_line;
   2836 	dst_line += dst_stride;
   2837 	mask = mask_line;
   2838 	mask_line += mask_stride;
   2839 	w = width;
   2840 
   2841 	while (w && (uintptr_t)dst & 7)
   2842 	{
   2843 	    uint16_t tmp;
   2844 	    uint8_t a;
   2845 	    uint32_t m, d;
   2846 
   2847 	    a = *mask++;
   2848 	    d = *dst;
   2849 
   2850 	    m = MUL_UN8 (sa, a, tmp);
   2851 	    d = MUL_UN8 (m, d, tmp);
   2852 
   2853 	    *dst++ = d;
   2854 	    w--;
   2855 	}
   2856 
   2857 	while (w >= 4)
   2858 	{
   2859 	    __m64 vmask;
   2860 	    __m64 vdest;
   2861 
   2862 	    vmask = load8888u ((uint32_t *)mask);
   2863 	    vdest = load8888 ((uint32_t *)dst);
   2864 
   2865 	    store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
   2866 
   2867 	    dst += 4;
   2868 	    mask += 4;
   2869 	    w -= 4;
   2870 	}
   2871 
   2872 	while (w--)
   2873 	{
   2874 	    uint16_t tmp;
   2875 	    uint8_t a;
   2876 	    uint32_t m, d;
   2877 
   2878 	    a = *mask++;
   2879 	    d = *dst;
   2880 
   2881 	    m = MUL_UN8 (sa, a, tmp);
   2882 	    d = MUL_UN8 (m, d, tmp);
   2883 
   2884 	    *dst++ = d;
   2885 	}
   2886     }
   2887 
   2888     _mm_empty ();
   2889 }
   2890 
   2891 static void
   2892 mmx_composite_in_8_8 (pixman_implementation_t *imp,
   2893                       pixman_composite_info_t *info)
   2894 {
   2895     PIXMAN_COMPOSITE_ARGS (info);
   2896     uint8_t     *dst_line, *dst;
   2897     uint8_t     *src_line, *src;
   2898     int src_stride, dst_stride;
   2899     int32_t w;
   2900 
   2901     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
   2902     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
   2903 
   2904     while (height--)
   2905     {
   2906 	dst = dst_line;
   2907 	dst_line += dst_stride;
   2908 	src = src_line;
   2909 	src_line += src_stride;
   2910 	w = width;
   2911 
   2912 	while (w && (uintptr_t)dst & 3)
   2913 	{
   2914 	    uint8_t s, d;
   2915 	    uint16_t tmp;
   2916 
   2917 	    s = *src;
   2918 	    d = *dst;
   2919 
   2920 	    *dst = MUL_UN8 (s, d, tmp);
   2921 
   2922 	    src++;
   2923 	    dst++;
   2924 	    w--;
   2925 	}
   2926 
   2927 	while (w >= 4)
   2928 	{
   2929 	    uint32_t *s = (uint32_t *)src;
   2930 	    uint32_t *d = (uint32_t *)dst;
   2931 
   2932 	    store8888 (d, in (load8888u (s), load8888 (d)));
   2933 
   2934 	    w -= 4;
   2935 	    dst += 4;
   2936 	    src += 4;
   2937 	}
   2938 
   2939 	while (w--)
   2940 	{
   2941 	    uint8_t s, d;
   2942 	    uint16_t tmp;
   2943 
   2944 	    s = *src;
   2945 	    d = *dst;
   2946 
   2947 	    *dst = MUL_UN8 (s, d, tmp);
   2948 
   2949 	    src++;
   2950 	    dst++;
   2951 	}
   2952     }
   2953 
   2954     _mm_empty ();
   2955 }
   2956 
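         /* Saturating ADD of a solid color through an a8 mask into an a8
          * destination: dest = dest + src_alpha * mask. */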
   2957 static void
   2958 mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
   2959 			 pixman_composite_info_t *info)
   2960 {
   2961     PIXMAN_COMPOSITE_ARGS (info);
   2962     uint8_t     *dst_line, *dst;
   2963     uint8_t     *mask_line, *mask;
   2964     int dst_stride, mask_stride;
   2965     int32_t w;
   2966     uint32_t src;
   2967     uint8_t sa;
   2968     __m64 vsrc, vsrca;
   2969 
   2970     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
   2971     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   2972 
   2973     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2974 
   2975     sa = src >> 24;
   2976 
   2977     if (src == 0)
   2978 	return;
   2979 
   2980     vsrc = load8888 (&src);
   2981     vsrca = expand_alpha (vsrc);
   2982 
   2983     while (height--)
   2984     {
   2985 	dst = dst_line;
   2986 	dst_line += dst_stride;
   2987 	mask = mask_line;
   2988 	mask_line += mask_stride;
   2989 	w = width;
   2990 
   2991 	while (w && (uintptr_t)dst & 3)
   2992 	{
   2993 	    uint16_t tmp;
   2994 	    uint16_t a;
   2995 	    uint32_t m, d;
   2996 	    uint32_t r;
   2997 
   2998 	    a = *mask++;
   2999 	    d = *dst;
   3000 
   3001 	    m = MUL_UN8 (sa, a, tmp);
   3002 	    r = ADD_UN8 (m, d, tmp);
   3003 
   3004 	    *dst++ = r;
   3005 	    w--;
   3006 	}
   3007 
   3008 	while (w >= 4)
   3009 	{
   3010 	    __m64 vmask;
   3011 	    __m64 vdest;
   3012 
   3013 	    vmask = load8888u ((uint32_t *)mask);
   3014 	    vdest = load8888 ((uint32_t *)dst);
   3015 
   3016 	    store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
   3017 
   3018 	    dst += 4;
   3019 	    mask += 4;
   3020 	    w -= 4;
   3021 	}
   3022 
   3023 	while (w--)
   3024 	{
   3025 	    uint16_t tmp;
   3026 	    uint16_t a;
   3027 	    uint32_t m, d;
   3028 	    uint32_t r;
   3029 
   3030 	    a = *mask++;
   3031 	    d = *dst;
   3032 
   3033 	    m = MUL_UN8 (sa, a, tmp);
   3034 	    r = ADD_UN8 (m, d, tmp);
   3035 
   3036 	    *dst++ = r;
   3037 	}
   3038     }
   3039 
   3040     _mm_empty ();
   3041 }
   3042 
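         /* Saturating ADD of two a8 images, eight bytes per iteration once
          * the destination is 8-byte aligned. */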
   3043 static void
   3044 mmx_composite_add_8_8 (pixman_implementation_t *imp,
   3045 		       pixman_composite_info_t *info)
   3046 {
   3047     PIXMAN_COMPOSITE_ARGS (info);
   3048     uint8_t *dst_line, *dst;
   3049     uint8_t *src_line, *src;
   3050     int dst_stride, src_stride;
   3051     int32_t w;
   3052     uint8_t s, d;
   3053     uint16_t t;
   3054 
   3055     CHECKPOINT ();
   3056 
   3057     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
   3058     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
   3059 
   3060     while (height--)
   3061     {
   3062 	dst = dst_line;
   3063 	dst_line += dst_stride;
   3064 	src = src_line;
   3065 	src_line += src_stride;
   3066 	w = width;
   3067 
   3068 	while (w && (uintptr_t)dst & 7)
   3069 	{
   3070 	    s = *src;
   3071 	    d = *dst;
   3072 	    t = d + s;
   3073 	    s = t | (0 - (t >> 8));
   3074 	    *dst = s;
   3075 
   3076 	    dst++;
   3077 	    src++;
   3078 	    w--;
   3079 	}
   3080 
   3081 	while (w >= 8)
   3082 	{
   3083 	    *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
   3084 	    dst += 8;
   3085 	    src += 8;
   3086 	    w -= 8;
   3087 	}
   3088 
   3089 	while (w)
   3090 	{
   3091 	    s = *src;
   3092 	    d = *dst;
   3093 	    t = d + s;
   3094 	    s = t | (0 - (t >> 8));
   3095 	    *dst = s;
   3096 
   3097 	    dst++;
   3098 	    src++;
   3099 	    w--;
   3100 	}
   3101     }
   3102 
   3103     _mm_empty ();
   3104 }
   3105 
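         /* Saturating ADD of two r5g6b5 images: quads are expanded to 8888,
          * added with _mm_adds_pu8() and packed back to 565. */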
   3106 static void
   3107 mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
   3108                              pixman_composite_info_t *info)
   3109 {
   3110     PIXMAN_COMPOSITE_ARGS (info);
   3111     uint16_t    *dst_line, *dst;
   3112     uint32_t	d;
   3113     uint16_t    *src_line, *src;
   3114     uint32_t	s;
   3115     int dst_stride, src_stride;
   3116     int32_t w;
   3117 
   3118     CHECKPOINT ();
   3119 
   3120     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
   3121     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   3122 
   3123     while (height--)
   3124     {
   3125 	dst = dst_line;
   3126 	dst_line += dst_stride;
   3127 	src = src_line;
   3128 	src_line += src_stride;
   3129 	w = width;
   3130 
   3131 	while (w && (uintptr_t)dst & 7)
   3132 	{
   3133 	    s = *src++;
   3134 	    if (s)
   3135 	    {
   3136 		d = *dst;
   3137 		s = convert_0565_to_8888 (s);
   3138 		if (d)
   3139 		{
   3140 		    d = convert_0565_to_8888 (d);
   3141 		    UN8x4_ADD_UN8x4 (s, d);
   3142 		}
   3143 		*dst = convert_8888_to_0565 (s);
   3144 	    }
   3145 	    dst++;
   3146 	    w--;
   3147 	}
   3148 
   3149 	while (w >= 4)
   3150 	{
   3151 	    __m64 vdest = *(__m64 *)dst;
   3152 	    __m64 vsrc = ldq_u ((__m64 *)src);
   3153 	    __m64 vd0, vd1;
   3154 	    __m64 vs0, vs1;
   3155 
   3156 	    expand_4xpacked565 (vdest, &vd0, &vd1, 0);
   3157 	    expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
   3158 
   3159 	    vd0 = _mm_adds_pu8 (vd0, vs0);
   3160 	    vd1 = _mm_adds_pu8 (vd1, vs1);
   3161 
   3162 	    *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
   3163 
   3164 	    dst += 4;
   3165 	    src += 4;
   3166 	    w -= 4;
   3167 	}
   3168 
   3169 	while (w--)
   3170 	{
   3171 	    s = *src++;
   3172 	    if (s)
   3173 	    {
   3174 		d = *dst;
   3175 		s = convert_0565_to_8888 (s);
   3176 		if (d)
   3177 		{
   3178 		    d = convert_0565_to_8888 (d);
   3179 		    UN8x4_ADD_UN8x4 (s, d);
   3180 		}
   3181 		*dst = convert_8888_to_0565 (s);
   3182 	    }
   3183 	    dst++;
   3184 	}
   3185     }
   3186 
   3187     _mm_empty ();
   3188 }
   3189 
   3190 static void
   3191 mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
   3192                              pixman_composite_info_t *info)
   3193 {
   3194     PIXMAN_COMPOSITE_ARGS (info);
   3195     uint32_t    *dst_line, *dst;
   3196     uint32_t    *src_line, *src;
   3197     int dst_stride, src_stride;
   3198     int32_t w;
   3199 
   3200     CHECKPOINT ();
   3201 
   3202     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   3203     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   3204 
   3205     while (height--)
   3206     {
   3207 	dst = dst_line;
   3208 	dst_line += dst_stride;
   3209 	src = src_line;
   3210 	src_line += src_stride;
   3211 	w = width;
   3212 
   3213 	while (w && (uintptr_t)dst & 7)
   3214 	{
   3215 	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
   3216 	                              load ((const uint32_t *)dst)));
   3217 	    dst++;
   3218 	    src++;
   3219 	    w--;
   3220 	}
   3221 
   3222 	while (w >= 2)
   3223 	{
   3224 	    *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
   3225 	    dst += 2;
   3226 	    src += 2;
   3227 	    w -= 2;
   3228 	}
   3229 
   3230 	if (w)
   3231 	{
   3232 	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
   3233 	                              load ((const uint32_t *)dst)));
   3234 
   3235 	}
   3236     }
   3237 
   3238     _mm_empty ();
   3239 }
   3240 
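         /* Plain blit for equal source and destination bpp (16 or 32).
          * Each row is aligned to 8 bytes and then copied 64 bytes per
          * iteration through the MMX register file. */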
   3241 static pixman_bool_t
   3242 mmx_blt (pixman_implementation_t *imp,
   3243          uint32_t *               src_bits,
   3244          uint32_t *               dst_bits,
   3245          int                      src_stride,
   3246          int                      dst_stride,
   3247          int                      src_bpp,
   3248          int                      dst_bpp,
   3249          int                      src_x,
   3250          int                      src_y,
   3251          int                      dest_x,
   3252          int                      dest_y,
   3253          int                      width,
   3254          int                      height)
   3255 {
   3256     uint8_t *   src_bytes;
   3257     uint8_t *   dst_bytes;
   3258     int byte_width;
   3259 
   3260     if (src_bpp != dst_bpp)
   3261 	return FALSE;
   3262 
   3263     if (src_bpp == 16)
   3264     {
   3265 	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
   3266 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
   3267 	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
   3268 	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
   3269 	byte_width = 2 * width;
   3270 	src_stride *= 2;
   3271 	dst_stride *= 2;
   3272     }
   3273     else if (src_bpp == 32)
   3274     {
   3275 	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
   3276 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
   3277 	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
   3278 	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
   3279 	byte_width = 4 * width;
   3280 	src_stride *= 4;
   3281 	dst_stride *= 4;
   3282     }
   3283     else
   3284     {
   3285 	return FALSE;
   3286     }
   3287 
   3288     while (height--)
   3289     {
   3290 	int w;
   3291 	uint8_t *s = src_bytes;
   3292 	uint8_t *d = dst_bytes;
   3293 	src_bytes += src_stride;
   3294 	dst_bytes += dst_stride;
   3295 	w = byte_width;
   3296 
   3297 	if (w >= 1 && ((uintptr_t)d & 1))
   3298 	{
   3299 	    *(uint8_t *)d = *(uint8_t *)s;
   3300 	    w -= 1;
   3301 	    s += 1;
   3302 	    d += 1;
   3303 	}
   3304 
   3305 	if (w >= 2 && ((uintptr_t)d & 3))
   3306 	{
   3307 	    *(uint16_t *)d = *(uint16_t *)s;
   3308 	    w -= 2;
   3309 	    s += 2;
   3310 	    d += 2;
   3311 	}
   3312 
   3313 	while (w >= 4 && ((uintptr_t)d & 7))
   3314 	{
   3315 	    *(uint32_t *)d = ldl_u ((uint32_t *)s);
   3316 
   3317 	    w -= 4;
   3318 	    s += 4;
   3319 	    d += 4;
   3320 	}
   3321 
   3322 	while (w >= 64)
   3323 	{
   3324 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
   3325 	    __asm__ (
   3326 	        "movq	  (%1),	  %%mm0\n"
   3327 	        "movq	 8(%1),	  %%mm1\n"
   3328 	        "movq	16(%1),	  %%mm2\n"
   3329 	        "movq	24(%1),	  %%mm3\n"
   3330 	        "movq	32(%1),	  %%mm4\n"
   3331 	        "movq	40(%1),	  %%mm5\n"
   3332 	        "movq	48(%1),	  %%mm6\n"
   3333 	        "movq	56(%1),	  %%mm7\n"
   3334 
   3335 	        "movq	%%mm0,	  (%0)\n"
   3336 	        "movq	%%mm1,	 8(%0)\n"
   3337 	        "movq	%%mm2,	16(%0)\n"
   3338 	        "movq	%%mm3,	24(%0)\n"
   3339 	        "movq	%%mm4,	32(%0)\n"
   3340 	        "movq	%%mm5,	40(%0)\n"
   3341 	        "movq	%%mm6,	48(%0)\n"
   3342 	        "movq	%%mm7,	56(%0)\n"
   3343 		:
   3344 		: "r" (d), "r" (s)
   3345 		: "memory",
   3346 		  "%mm0", "%mm1", "%mm2", "%mm3",
   3347 		  "%mm4", "%mm5", "%mm6", "%mm7");
   3348 #else
   3349 	    __m64 v0 = ldq_u ((__m64 *)(s + 0));
   3350 	    __m64 v1 = ldq_u ((__m64 *)(s + 8));
   3351 	    __m64 v2 = ldq_u ((__m64 *)(s + 16));
   3352 	    __m64 v3 = ldq_u ((__m64 *)(s + 24));
   3353 	    __m64 v4 = ldq_u ((__m64 *)(s + 32));
   3354 	    __m64 v5 = ldq_u ((__m64 *)(s + 40));
   3355 	    __m64 v6 = ldq_u ((__m64 *)(s + 48));
   3356 	    __m64 v7 = ldq_u ((__m64 *)(s + 56));
   3357 	    *(__m64 *)(d + 0)  = v0;
   3358 	    *(__m64 *)(d + 8)  = v1;
   3359 	    *(__m64 *)(d + 16) = v2;
   3360 	    *(__m64 *)(d + 24) = v3;
   3361 	    *(__m64 *)(d + 32) = v4;
   3362 	    *(__m64 *)(d + 40) = v5;
   3363 	    *(__m64 *)(d + 48) = v6;
   3364 	    *(__m64 *)(d + 56) = v7;
   3365 #endif
   3366 
   3367 	    w -= 64;
   3368 	    s += 64;
   3369 	    d += 64;
   3370 	}
   3371 	while (w >= 4)
   3372 	{
   3373 	    *(uint32_t *)d = ldl_u ((uint32_t *)s);
   3374 
   3375 	    w -= 4;
   3376 	    s += 4;
   3377 	    d += 4;
   3378 	}
   3379 	if (w >= 2)
   3380 	{
   3381 	    *(uint16_t *)d = *(uint16_t *)s;
   3382 	    w -= 2;
   3383 	    s += 2;
   3384 	    d += 2;
   3385 	}
   3386     }
   3387 
   3388     _mm_empty ();
   3389 
   3390     return TRUE;
   3391 }
   3392 
   3393 static void
   3394 mmx_composite_copy_area (pixman_implementation_t *imp,
   3395                          pixman_composite_info_t *info)
   3396 {
   3397     PIXMAN_COMPOSITE_ARGS (info);
   3398 
   3399     mmx_blt (imp, src_image->bits.bits,
   3400 	     dest_image->bits.bits,
   3401 	     src_image->bits.rowstride,
   3402 	     dest_image->bits.rowstride,
   3403 	     PIXMAN_FORMAT_BPP (src_image->bits.format),
   3404 	     PIXMAN_FORMAT_BPP (dest_image->bits.format),
   3405 	     src_x, src_y, dest_x, dest_y, width, height);
   3406 }
   3407 
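         /* OVER of an x8r8g8b8 source (alpha forced to 0xff) onto a8r8g8b8
          * through an a8 mask, one pixel at a time. */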
   3408 static void
   3409 mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
   3410                                 pixman_composite_info_t *info)
   3411 {
   3412     PIXMAN_COMPOSITE_ARGS (info);
   3413     uint32_t  *src, *src_line;
   3414     uint32_t  *dst, *dst_line;
   3415     uint8_t  *mask, *mask_line;
   3416     int src_stride, mask_stride, dst_stride;
   3417     int32_t w;
   3418 
   3419     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   3420     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   3421     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   3422 
   3423     while (height--)
   3424     {
   3425 	src = src_line;
   3426 	src_line += src_stride;
   3427 	dst = dst_line;
   3428 	dst_line += dst_stride;
   3429 	mask = mask_line;
   3430 	mask_line += mask_stride;
   3431 
   3432 	w = width;
   3433 
   3434 	while (w--)
   3435 	{
   3436 	    uint64_t m = *mask;
   3437 
   3438 	    if (m)
   3439 	    {
   3440 		uint32_t ssrc = *src | 0xff000000;
   3441 		__m64 s = load8888 (&ssrc);
   3442 
   3443 		if (m == 0xff)
   3444 		{
   3445 		    store8888 (dst, s);
   3446 		}
   3447 		else
   3448 		{
   3449 		    __m64 sa = expand_alpha (s);
   3450 		    __m64 vm = expand_alpha_rev (to_m64 (m));
   3451 		    __m64 vdest = in_over (s, sa, vm, load8888 (dst));
   3452 
   3453 		    store8888 (dst, vdest);
   3454 		}
   3455 	    }
   3456 
   3457 	    mask++;
   3458 	    dst++;
   3459 	    src++;
   3460 	}
   3461     }
   3462 
   3463     _mm_empty ();
   3464 }
   3465 
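         /* OVER_REVERSE with a solid source: the existing destination is
          * composited over the solid color, two pixels per iteration. */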
   3466 static void
   3467 mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
   3468                                    pixman_composite_info_t *info)
   3469 {
   3470     PIXMAN_COMPOSITE_ARGS (info);
   3471     uint32_t src;
   3472     uint32_t    *dst_line, *dst;
   3473     int32_t w;
   3474     int dst_stride;
   3475     __m64 vsrc;
   3476 
   3477     CHECKPOINT ();
   3478 
   3479     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   3480 
   3481     if (src == 0)
   3482 	return;
   3483 
   3484     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   3485 
   3486     vsrc = load8888 (&src);
   3487 
   3488     while (height--)
   3489     {
   3490 	dst = dst_line;
   3491 	dst_line += dst_stride;
   3492 	w = width;
   3493 
   3494 	CHECKPOINT ();
   3495 
   3496 	while (w && (uintptr_t)dst & 7)
   3497 	{
   3498 	    __m64 vdest = load8888 (dst);
   3499 
   3500 	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
   3501 
   3502 	    w--;
   3503 	    dst++;
   3504 	}
   3505 
   3506 	while (w >= 2)
   3507 	{
   3508 	    __m64 vdest = *(__m64 *)dst;
   3509 	    __m64 dest0 = expand8888 (vdest, 0);
   3510 	    __m64 dest1 = expand8888 (vdest, 1);
   3511 
   3512 
   3513 	    dest0 = over (dest0, expand_alpha (dest0), vsrc);
   3514 	    dest1 = over (dest1, expand_alpha (dest1), vsrc);
   3515 
   3516 	    *(__m64 *)dst = pack8888 (dest0, dest1);
   3517 
   3518 	    dst += 2;
   3519 	    w -= 2;
   3520 	}
   3521 
   3522 	CHECKPOINT ();
   3523 
   3524 	if (w)
   3525 	{
   3526 	    __m64 vdest = load8888 (dst);
   3527 
   3528 	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
   3529 	}
   3530     }
   3531 
   3532     _mm_empty ();
   3533 }
   3534 
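         /* Bilinear scaling helpers: BILINEAR_DECLARE_VARIABLES sets up the
          * vertical (wt/wb) and horizontal weight registers, and
          * BILINEAR_INTERPOLATE_ONE_PIXEL fetches a 2x2 block, interpolates
          * vertically then horizontally, and packs one a8r8g8b8 pixel. */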
   3535 #define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
   3536 #define BMSK (BSHIFT - 1)
   3537 
   3538 #define BILINEAR_DECLARE_VARIABLES						\
   3539     const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);				\
   3540     const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);				\
   3541     const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT);	\
   3542     const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);				\
   3543     const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);			\
   3544     const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);		\
   3545     const __m64 mm_zero = _mm_setzero_si64 ();					\
   3546     __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
   3547 
   3548 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
   3549 do {										\
   3550     /* fetch 2x2 pixel block into 2 mmx registers */				\
   3551     __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);		\
   3552     __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);		\
   3553     /* vertical interpolation */						\
   3554     __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);		\
   3555     __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);		\
   3556     __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);		\
   3557     __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);		\
   3558     __m64 hi = _mm_add_pi16 (t_hi, b_hi);					\
   3559     __m64 lo = _mm_add_pi16 (t_lo, b_lo);					\
   3560     vx += unit_x;								\
   3561     if (BILINEAR_INTERPOLATION_BITS < 8)					\
   3562     {										\
   3563 	/* calculate horizontal weights */					\
   3564 	__m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,		\
   3565 			  _mm_srli_pi16 (mm_x,					\
   3566 					 16 - BILINEAR_INTERPOLATION_BITS)));	\
   3567 	/* horizontal interpolation */						\
   3568 	__m64 p = _mm_unpacklo_pi16 (lo, hi);					\
   3569 	__m64 q = _mm_unpackhi_pi16 (lo, hi);					\
   3570 	lo = _mm_madd_pi16 (p, mm_wh);						\
   3571 	hi = _mm_madd_pi16 (q, mm_wh);						\
   3572     }										\
   3573     else									\
   3574     {										\
   3575 	/* calculate horizontal weights */					\
   3576 	__m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x,		\
   3577 					16 - BILINEAR_INTERPOLATION_BITS));	\
   3578 	__m64 mm_wh_hi = _mm_srli_pi16 (mm_x,					\
   3579 					16 - BILINEAR_INTERPOLATION_BITS);	\
   3580 	/* horizontal interpolation */						\
   3581 	__m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo);				\
   3582 	__m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi);				\
   3583 	__m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo);				\
   3584 	__m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi);				\
   3585 	lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo),		\
   3586 			   _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi));		\
   3587 	hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo),		\
   3588 			   _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi));		\
   3589     }										\
   3590     mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
   3591     /* shift and pack the result */						\
   3592     hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);			\
   3593     lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);			\
   3594     lo = _mm_packs_pi32 (lo, hi);						\
   3595     lo = _mm_packs_pu16 (lo, lo);						\
   3596     pix = lo;									\
   3597 } while (0)
   3598 
   3599 #define BILINEAR_SKIP_ONE_PIXEL()						\
   3600 do {										\
   3601     vx += unit_x;								\
   3602     mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
   3603 } while(0)
   3604 
   3605 static force_inline void
   3606 scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
   3607 					    const uint32_t * mask,
   3608 					    const uint32_t * src_top,
   3609 					    const uint32_t * src_bottom,
   3610 					    int32_t          w,
   3611 					    int              wt,
   3612 					    int              wb,
   3613 					    pixman_fixed_t   vx,
   3614 					    pixman_fixed_t   unit_x,
   3615 					    pixman_fixed_t   max_vx,
   3616 					    pixman_bool_t    zero_src)
   3617 {
   3618     BILINEAR_DECLARE_VARIABLES;
   3619     __m64 pix;
   3620 
   3621     while (w--)
   3622     {
   3623 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
   3624 	store (dst, pix);
   3625 	dst++;
   3626     }
   3627 
   3628     _mm_empty ();
   3629 }
   3630 
   3631 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
   3632 			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
   3633 			       uint32_t, uint32_t, uint32_t,
   3634 			       COVER, FLAG_NONE)
   3635 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
   3636 			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
   3637 			       uint32_t, uint32_t, uint32_t,
   3638 			       PAD, FLAG_NONE)
   3639 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
   3640 			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
   3641 			       uint32_t, uint32_t, uint32_t,
   3642 			       NONE, FLAG_NONE)
   3643 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
   3644 			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
   3645 			       uint32_t, uint32_t, uint32_t,
   3646 			       NORMAL, FLAG_NONE)
   3647 
   3648 static force_inline void
   3649 scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
   3650 					     const uint32_t * mask,
   3651 					     const uint32_t * src_top,
   3652 					     const uint32_t * src_bottom,
   3653 					     int32_t          w,
   3654 					     int              wt,
   3655 					     int              wb,
   3656 					     pixman_fixed_t   vx,
   3657 					     pixman_fixed_t   unit_x,
   3658 					     pixman_fixed_t   max_vx,
   3659 					     pixman_bool_t    zero_src)
   3660 {
   3661     BILINEAR_DECLARE_VARIABLES;
   3662     __m64 pix1, pix2;
   3663 
   3664     while (w)
   3665     {
   3666 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   3667 
   3668 	if (!is_zero (pix1))
   3669 	{
   3670 	    pix2 = load (dst);
   3671 	    store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
   3672 	}
   3673 
   3674 	w--;
   3675 	dst++;
   3676     }
   3677 
   3678     _mm_empty ();
   3679 }
   3680 
   3681 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
   3682 			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
   3683 			       uint32_t, uint32_t, uint32_t,
   3684 			       COVER, FLAG_NONE)
   3685 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
   3686 			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
   3687 			       uint32_t, uint32_t, uint32_t,
   3688 			       PAD, FLAG_NONE)
   3689 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
   3690 			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
   3691 			       uint32_t, uint32_t, uint32_t,
   3692 			       NONE, FLAG_NONE)
   3693 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
   3694 			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
   3695 			       uint32_t, uint32_t, uint32_t,
   3696 			       NORMAL, FLAG_NONE)
   3697 
   3698 static force_inline void
   3699 scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
   3700 					       const uint8_t  * mask,
   3701 					       const uint32_t * src_top,
   3702 					       const uint32_t * src_bottom,
   3703 					       int32_t          w,
   3704 					       int              wt,
   3705 					       int              wb,
   3706 					       pixman_fixed_t   vx,
   3707 					       pixman_fixed_t   unit_x,
   3708 					       pixman_fixed_t   max_vx,
   3709 					       pixman_bool_t    zero_src)
   3710 {
   3711     BILINEAR_DECLARE_VARIABLES;
   3712     __m64 pix1, pix2;
   3713     uint32_t m;
   3714 
   3715     while (w)
   3716     {
   3717 	m = (uint32_t) *mask++;
   3718 
   3719 	if (m)
   3720 	{
   3721 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   3722 
   3723 	    if (m == 0xff && is_opaque (pix1))
   3724 	    {
   3725 		store (dst, pix1);
   3726 	    }
   3727 	    else
   3728 	    {
   3729 		__m64 ms, md, ma, msa;
   3730 
   3731 		pix2 = load (dst);
   3732 		ma = expand_alpha_rev (to_m64 (m));
   3733 		ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
   3734 		md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());
   3735 
   3736 		msa = expand_alpha (ms);
   3737 
   3738 		store8888 (dst, (in_over (ms, msa, ma, md)));
   3739 	    }
   3740 	}
   3741 	else
   3742 	{
   3743 	    BILINEAR_SKIP_ONE_PIXEL ();
   3744 	}
   3745 
   3746 	w--;
   3747 	dst++;
   3748     }
   3749 
   3750     _mm_empty ();
   3751 }
   3752 
   3753 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
   3754 			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
   3755 			       uint32_t, uint8_t, uint32_t,
   3756 			       COVER, FLAG_HAVE_NON_SOLID_MASK)
   3757 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
   3758 			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
   3759 			       uint32_t, uint8_t, uint32_t,
   3760 			       PAD, FLAG_HAVE_NON_SOLID_MASK)
   3761 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
   3762 			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
   3763 			       uint32_t, uint8_t, uint32_t,
   3764 			       NONE, FLAG_HAVE_NON_SOLID_MASK)
   3765 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
   3766 			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
   3767 			       uint32_t, uint8_t, uint32_t,
   3768 			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
   3769 
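         /* Source iterator scanline fetchers: each converts one row of the
          * image's native format into a8r8g8b8 in iter->buffer and advances
          * iter->bits to the next row.  The x8r8g8b8 fetcher only has to force
          * the alpha byte to 0xff; it handles unaligned head and tail pixels
          * one at a time and eight pixels (four __m64 stores) per main-loop
          * iteration.
          */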
   3770 static uint32_t *
   3771 mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
   3772 {
   3773     int w = iter->width;
   3774     uint32_t *dst = iter->buffer;
   3775     uint32_t *src = (uint32_t *)iter->bits;
   3776 
   3777     iter->bits += iter->stride;
   3778 
   3779     while (w && ((uintptr_t)dst) & 7)
   3780     {
   3781 	*dst++ = (*src++) | 0xff000000;
   3782 	w--;
   3783     }
   3784 
   3785     while (w >= 8)
   3786     {
   3787 	__m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
   3788 	__m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
   3789 	__m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
   3790 	__m64 vsrc4 = ldq_u ((__m64 *)(src + 6));
   3791 
   3792 	*(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
   3793 	*(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
   3794 	*(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
   3795 	*(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));
   3796 
   3797 	dst += 8;
   3798 	src += 8;
   3799 	w -= 8;
   3800     }
   3801 
   3802     while (w)
   3803     {
   3804 	*dst++ = (*src++) | 0xff000000;
   3805 	w--;
   3806     }
   3807 
   3808     _mm_empty ();
   3809     return iter->buffer;
   3810 }
   3811 
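         /* r5g6b5 fetcher: widens four packed 16-bit pixels at a time to opaque
          * a8r8g8b8 with expand_4xpacked565 (), using convert_0565_to_8888 ()
          * for the unaligned head and the tail.
          */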
   3812 static uint32_t *
   3813 mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
   3814 {
   3815     int w = iter->width;
   3816     uint32_t *dst = iter->buffer;
   3817     uint16_t *src = (uint16_t *)iter->bits;
   3818 
   3819     iter->bits += iter->stride;
   3820 
   3821     while (w && ((uintptr_t)dst) & 0x0f)
   3822     {
   3823 	uint16_t s = *src++;
   3824 
   3825 	*dst++ = convert_0565_to_8888 (s);
   3826 	w--;
   3827     }
   3828 
   3829     while (w >= 4)
   3830     {
   3831 	__m64 vsrc = ldq_u ((__m64 *)src);
   3832 	__m64 mm0, mm1;
   3833 
   3834 	expand_4xpacked565 (vsrc, &mm0, &mm1, 1);
   3835 
   3836 	*(__m64 *)(dst + 0) = mm0;
   3837 	*(__m64 *)(dst + 2) = mm1;
   3838 
   3839 	dst += 4;
   3840 	src += 4;
   3841 	w -= 4;
   3842     }
   3843 
   3844     while (w)
   3845     {
   3846 	uint16_t s = *src++;
   3847 
   3848 	*dst++ = convert_0565_to_8888 (s);
   3849 	w--;
   3850     }
   3851 
   3852     _mm_empty ();
   3853     return iter->buffer;
   3854 }
   3855 
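         /* a8 fetcher: each alpha byte becomes an a8r8g8b8 pixel with the alpha
          * in the top byte and zero color channels (value << 24), eight pixels
          * per main-loop iteration.
          */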
   3856 static uint32_t *
   3857 mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
   3858 {
   3859     int w = iter->width;
   3860     uint32_t *dst = iter->buffer;
   3861     uint8_t *src = iter->bits;
   3862 
   3863     iter->bits += iter->stride;
   3864 
   3865     while (w && (((uintptr_t)dst) & 15))
   3866     {
    3867 	*dst++ = *(src++) << 24;
    3868 	w--;
   3869     }
   3870 
   3871     while (w >= 8)
   3872     {
   3873 	__m64 mm0 = ldq_u ((__m64 *)src);
   3874 
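         	/* Unpacking against zero twice (bytes, then words) moves each
         	 * alpha byte into the top byte of its own 32-bit lane, i.e. the
         	 * vector equivalent of the << 24 in the scalar loops.  */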
   3875 	__m64 mm1 = _mm_unpacklo_pi8  (_mm_setzero_si64(), mm0);
   3876 	__m64 mm2 = _mm_unpackhi_pi8  (_mm_setzero_si64(), mm0);
   3877 	__m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
   3878 	__m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
   3879 	__m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
   3880 	__m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);
   3881 
   3882 	*(__m64 *)(dst + 0) = mm3;
   3883 	*(__m64 *)(dst + 2) = mm4;
   3884 	*(__m64 *)(dst + 4) = mm5;
   3885 	*(__m64 *)(dst + 6) = mm6;
   3886 
   3887 	dst += 8;
   3888 	src += 8;
   3889 	w -= 8;
   3890     }
   3891 
   3892     while (w)
   3893     {
   3894 	*dst++ = *(src++) << 24;
   3895 	w--;
   3896     }
   3897 
   3898     _mm_empty ();
   3899     return iter->buffer;
   3900 }
   3901 
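         /* Formats the MMX source iterator can fetch directly, each paired with
          * its scanline fetcher; the table is terminated by PIXMAN_null.
          */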
   3902 typedef struct
   3903 {
   3904     pixman_format_code_t	format;
   3905     pixman_iter_get_scanline_t	get_scanline;
   3906 } fetcher_info_t;
   3907 
   3908 static const fetcher_info_t fetchers[] =
   3909 {
   3910     { PIXMAN_x8r8g8b8,		mmx_fetch_x8r8g8b8 },
   3911     { PIXMAN_r5g6b5,		mmx_fetch_r5g6b5 },
   3912     { PIXMAN_a8,		mmx_fetch_a8 },
   3913     { PIXMAN_null }
   3914 };
   3915 
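         /* Install one of the fetchers above as the iterator's get_scanline
          * callback.  Only narrow iterators over untransformed bits images
          * whose samples cover the clip are accepted; for anything else FALSE
          * is returned so the caller falls back to a more general iterator.
          */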
   3916 static pixman_bool_t
   3917 mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
   3918 {
   3919     pixman_image_t *image = iter->image;
   3920 
   3921 #define FLAGS								\
   3922     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
   3923      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
   3924 
   3925     if ((iter->iter_flags & ITER_NARROW)			&&
   3926 	(iter->image_flags & FLAGS) == FLAGS)
   3927     {
   3928 	const fetcher_info_t *f;
   3929 
   3930 	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
   3931 	{
   3932 	    if (image->common.extended_format_code == f->format)
   3933 	    {
   3934 		uint8_t *b = (uint8_t *)image->bits.bits;
   3935 		int s = image->bits.rowstride * 4;
   3936 
   3937 		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
   3938 		iter->stride = s;
   3939 
   3940 		iter->get_scanline = f->get_scanline;
   3941 		return TRUE;
   3942 	    }
   3943 	}
   3944     }
   3945 
   3946     return FALSE;
   3947 }
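         /* Fast-path table: each entry matches an operator together with
          * source, mask and destination formats to the MMX composite routine
          * handling that combination.  The table is searched in order, the
          * first matching entry wins, and PIXMAN_OP_NONE terminates it.
          */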
   3948 
   3949 static const pixman_fast_path_t mmx_fast_paths[] =
   3950 {
   3951     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
   3952     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
   3953     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
   3954     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
   3955     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
   3956     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
   3957     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
   3958     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
   3959     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
   3960     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
   3961     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
   3962     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
   3963     PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
   3964     PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
   3965     PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
   3966     PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
   3967     PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
   3968     PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
   3969     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
   3970     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
   3971     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
   3972     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
   3973     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
   3974     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
   3975     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
   3976     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
   3977     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
   3978     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
   3979     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
   3980     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
   3981     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
   3982     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
   3983     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
   3984     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
   3985     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
   3986     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
   3987 
   3988     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
   3989     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
   3990     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
   3991     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
   3992     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
   3993     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),
   3994 
   3995     PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
   3996     PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),
   3997 
   3998     PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ),
   3999     PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ),
   4000     PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
   4001     PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
   4002     PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8		   ),
   4003     PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),
   4004 
   4005     PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
   4006     PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
   4007     PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
   4008     PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
   4009     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
   4010     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
   4011     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
   4012     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
   4013     PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
   4014     PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
   4015     PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
   4016     PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
   4017     PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
   4018     PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
   4019     PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
   4020     PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),
   4021 
   4022     PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
   4023     PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),
   4024 
   4025     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          a8r8g8b8, mmx_8888_8888                     ),
   4026     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
   4027     SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
   4028     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          a8b8g8r8, mmx_8888_8888                     ),
   4029     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
   4030     SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
   4031 
   4032     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         x8r8g8b8, mmx_8888_8888                     ),
   4033     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         x8b8g8r8, mmx_8888_8888                     ),
   4034     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         a8r8g8b8, mmx_8888_8888                     ),
   4035     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         a8b8g8r8, mmx_8888_8888                     ),
   4036 
   4037     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888                   ),
   4038     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888                   ),
   4039     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888                   ),
   4040     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888                   ),
   4041 
   4042     { PIXMAN_OP_NONE },
   4043 };
   4044 
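         /* Build the MMX implementation: register the fast-path table above and
          * override the unified (combine_32) and component-alpha (combine_32_ca)
          * combiners, blt, fill and the source iterator init with the MMX
          * versions from this file.  Anything not overridden here is delegated
          * to the fallback implementation.
          */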
   4045 pixman_implementation_t *
   4046 _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
   4047 {
   4048     pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
   4049 
   4050     imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
   4051     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
   4052     imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
   4053     imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
   4054     imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
   4055     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
   4056     imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
   4057     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
   4058     imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
   4059     imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
   4060     imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
   4061 
   4062     imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
   4063     imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
   4064     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
   4065     imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
   4066     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
   4067     imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
   4068     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
   4069     imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
   4070     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
   4071     imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
   4072     imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
   4073 
   4074     imp->blt = mmx_blt;
   4075     imp->fill = mmx_fill;
   4076 
   4077     imp->src_iter_init = mmx_src_iter_init;
   4078 
   4079     return imp;
   4080 }
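
         /* Typical wiring (a sketch, not part of this file): the runtime CPU
          * detection code builds the implementation chain and inserts the MMX
          * implementation when the instruction set is available, roughly
          *
          *     imp = _pixman_implementation_create_general ();
          *     imp = _pixman_implementation_create_fast_path (imp);
          *     if (have_mmx ())
          *         imp = _pixman_implementation_create_mmx (imp);
          *
          * where have_mmx () stands in for the platform-specific feature check;
          * the real guards live in pixman-x86.c, pixman-arm.c and pixman-mips.c
          * and differ between ports and versions.
          */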
   4081 
   4082 #endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */
   4083