/*
 * Copyright © 2007 Luca Barbato
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Luca Barbato not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Luca Barbato makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Luca Barbato (lu_zero@gentoo.org)
 *
 * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell
 */

#include <config.h>
#include "pixman-private.h"
#include "pixman-combine32.h"
#include <altivec.h>

#define AVV(x...) {x}

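/* Replicate the alpha component of each pixel across all four channel
 * positions.  Byte 0 of each 32-bit word is the alpha channel here,
 * because pixman's a8r8g8b8 pixels keep alpha in the most significant
 * byte, which is byte 0 on big-endian PowerPC. */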
static force_inline vector unsigned int
splat_alpha (vector unsigned int pix)
{
    return vec_perm (pix, pix,
		     (vector unsigned char)AVV (
			 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04,
			 0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C));
}

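/* Per-channel multiply of two pixel vectors, treating each byte as a
 * value normalized to [0, 255].  The channels are widened to 16 bits,
 * multiplied, then divided by 255 using the usual rounding
 * approximation: t = x * a + 0x80;  result = (t + (t >> 8)) >> 8. */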
static force_inline vector unsigned int
pix_multiply (vector unsigned int p, vector unsigned int a)
{
    vector unsigned short hi, lo, mod;

    /* unpack to short */
    hi = (vector unsigned short)
	vec_mergeh ((vector unsigned char)AVV (0),
		    (vector unsigned char)p);

    mod = (vector unsigned short)
	vec_mergeh ((vector unsigned char)AVV (0),
		    (vector unsigned char)a);

    hi = vec_mladd (hi, mod, (vector unsigned short)
                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
                         0x0080, 0x0080, 0x0080, 0x0080));

    hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));

    hi = vec_sr (hi, vec_splat_u16 (8));

    /* unpack to short */
    lo = (vector unsigned short)
	vec_mergel ((vector unsigned char)AVV (0),
		    (vector unsigned char)p);
    mod = (vector unsigned short)
	vec_mergel ((vector unsigned char)AVV (0),
		    (vector unsigned char)a);

    lo = vec_mladd (lo, mod, (vector unsigned short)
                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
                         0x0080, 0x0080, 0x0080, 0x0080));

    lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));

    lo = vec_sr (lo, vec_splat_u16 (8));

    return (vector unsigned int)vec_packsu (hi, lo);
}

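/* Saturating per-byte add of two pixel vectors. */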
static force_inline vector unsigned int
pix_add (vector unsigned int a, vector unsigned int b)
{
    return (vector unsigned int)vec_adds ((vector unsigned char)a,
                                          (vector unsigned char)b);
}

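/* Computes x * a + y * b per channel, with each product normalized by
 * 255 and the final sum saturated to 255. */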
static force_inline vector unsigned int
pix_add_mul (vector unsigned int x,
             vector unsigned int a,
             vector unsigned int y,
             vector unsigned int b)
{
    vector unsigned int t1, t2;

    t1 = pix_multiply (x, a);
    t2 = pix_multiply (y, b);

    return pix_add (t1, t2);
}

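/* Per-byte complement: vec_nor (src, src) is ~src, i.e. 255 - c for
 * each channel. */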
static force_inline vector unsigned int
negate (vector unsigned int src)
{
    return vec_nor (src, src);
}

/* dest*~srca + src */
static force_inline vector unsigned int
over (vector unsigned int src,
      vector unsigned int srca,
      vector unsigned int dest)
{
    vector unsigned char tmp = (vector unsigned char)
	pix_multiply (dest, negate (srca));

    tmp = vec_adds ((vector unsigned char)src, tmp);
    return (vector unsigned int)tmp;
}

/* in == pix_multiply */
#define in_over(src, srca, mask, dest)					\
    over (pix_multiply (src, mask),					\
          pix_multiply (srca, mask), dest)


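/* AltiVec loads and stores ignore the low four address bits, so
 * unaligned access is done by hand: vec_lvsl/vec_lvsr build permutation
 * masks from each pointer, LOAD_VECTORS* assembles an unaligned 16-byte
 * value from two aligned loads, and STORE_VECTOR scatters one back
 * across two aligned stores.  The masks depend only on the pointers, so
 * they are computed once, before the pixel loop. */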
#define COMPUTE_SHIFT_MASK(source)					\
    source ## _mask = vec_lvsl (0, source);

#define COMPUTE_SHIFT_MASKS(dest, source)				\
    dest ## _mask = vec_lvsl (0, dest);					\
    source ## _mask = vec_lvsl (0, source);				\
    store_mask = vec_lvsr (0, dest);

#define COMPUTE_SHIFT_MASKC(dest, source, mask)				\
    mask ## _mask = vec_lvsl (0, mask);					\
    dest ## _mask = vec_lvsl (0, dest);					\
    source ## _mask = vec_lvsl (0, source);				\
    store_mask = vec_lvsr (0, dest);

/* Note: the macros below expect the caller to have declared the
 * temporary vectors they use (tmp1..tmp4, edges, the *_mask variables).
 * tmp3 and tmp4 must remain untouched between LOAD_VECTORS* and
 * STORE_VECTOR, which reuses them to preserve the edge bytes!
 */

#define LOAD_VECTORS(dest, source)			  \
    tmp1 = (typeof(tmp1))vec_ld (0, source);		  \
    tmp2 = (typeof(tmp2))vec_ld (15, source);		  \
    tmp3 = (typeof(tmp3))vec_ld (0, dest);		  \
    v ## source = (typeof(v ## source))			  \
	vec_perm (tmp1, tmp2, source ## _mask);		  \
    tmp4 = (typeof(tmp4))vec_ld (15, dest);		  \
    v ## dest = (typeof(v ## dest))			  \
	vec_perm (tmp3, tmp4, dest ## _mask);

#define LOAD_VECTORSC(dest, source, mask)		  \
    tmp1 = (typeof(tmp1))vec_ld (0, source);		  \
    tmp2 = (typeof(tmp2))vec_ld (15, source);		  \
    tmp3 = (typeof(tmp3))vec_ld (0, dest);		  \
    v ## source = (typeof(v ## source))			  \
	vec_perm (tmp1, tmp2, source ## _mask);		  \
    tmp4 = (typeof(tmp4))vec_ld (15, dest);		  \
    tmp1 = (typeof(tmp1))vec_ld (0, mask);		  \
    v ## dest = (typeof(v ## dest))			  \
	vec_perm (tmp3, tmp4, dest ## _mask);		  \
    tmp2 = (typeof(tmp2))vec_ld (15, mask);		  \
    v ## mask = (typeof(v ## mask))			  \
	vec_perm (tmp1, tmp2, mask ## _mask);

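/* Like LOAD_VECTORSC, but additionally multiplies the source by the
 * splatted alpha of the mask, reducing a masked unified-alpha operation
 * to its unmasked form. */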
#define LOAD_VECTORSM(dest, source, mask)				\
    LOAD_VECTORSC (dest, source, mask)					\
    v ## source = pix_multiply (v ## source,				\
                                splat_alpha (v ## mask));

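/* Write the combined vector back through the two aligned quadwords
 * covering dest; the edge bytes recovered from tmp3/tmp4 keep memory
 * outside the four destination pixels unchanged. */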
#define STORE_VECTOR(dest)						\
    edges = vec_perm (tmp4, tmp3, dest ## _mask);			\
    tmp3 = vec_perm ((vector unsigned char)v ## dest, edges, store_mask); \
    tmp1 = vec_perm (edges, (vector unsigned char)v ## dest, store_mask); \
    vec_st ((vector unsigned int) tmp3, 15, dest);			\
    vec_st ((vector unsigned int) tmp1, 0, dest);

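/* Each combiner below processes width / 4 pixels four at a time with
 * VMX, then finishes the remaining 0-3 pixels with the scalar UN8x4_*
 * macros from pixman-combine32.h. */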
static void
vmx_combine_over_u_no_mask (uint32_t *      dest,
                            const uint32_t *src,
                            int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {

	LOAD_VECTORS (dest, src);

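	/* OVER: dest = src + dest * (1 - src.alpha) */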
	vdest = over (vsrc, splat_alpha (vsrc), vdest);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t ia = ALPHA_8 (~s);

	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);

	dest[i] = d;
    }
}

static void
vmx_combine_over_u_mask (uint32_t *      dest,
                         const uint32_t *src,
                         const uint32_t *mask,
                         int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = over (vsrc, splat_alpha (vsrc), vdest);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t ia;

	UN8x4_MUL_UN8 (s, m);

	ia = ALPHA_8 (~s);

	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
	dest[i] = d;
    }
}

static void
vmx_combine_over_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    if (mask)
	vmx_combine_over_u_mask (dest, src, mask, width);
    else
	vmx_combine_over_u_no_mask (dest, src, width);
}

static void
vmx_combine_over_reverse_u_no_mask (uint32_t *      dest,
                                    const uint32_t *src,
                                    int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {

	LOAD_VECTORS (dest, src);

	vdest = over (vdest, splat_alpha (vdest), vsrc);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t ia = ALPHA_8 (~dest[i]);

	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
	dest[i] = s;
    }
}

static void
vmx_combine_over_reverse_u_mask (uint32_t *      dest,
                                 const uint32_t *src,
                                 const uint32_t *mask,
                                 int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {

	LOAD_VECTORSM (dest, src, mask);

	vdest = over (vdest, splat_alpha (vdest), vsrc);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t ia = ALPHA_8 (~dest[i]);

	UN8x4_MUL_UN8 (s, m);

	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
	dest[i] = s;
    }
}

static void
vmx_combine_over_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    if (mask)
	vmx_combine_over_reverse_u_mask (dest, src, mask, width);
    else
	vmx_combine_over_reverse_u_no_mask (dest, src, width);
}

static void
vmx_combine_in_u_no_mask (uint32_t *      dest,
                          const uint32_t *src,
                          int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_multiply (vsrc, splat_alpha (vdest));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t a = ALPHA_8 (dest[i]);

	UN8x4_MUL_UN8 (s, a);
	dest[i] = s;
    }
}

static void
vmx_combine_in_u_mask (uint32_t *      dest,
                       const uint32_t *src,
                       const uint32_t *mask,
                       int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_multiply (vsrc, splat_alpha (vdest));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t a = ALPHA_8 (dest[i]);

	UN8x4_MUL_UN8 (s, m);
	UN8x4_MUL_UN8 (s, a);

	dest[i] = s;
    }
}

static void
vmx_combine_in_u (pixman_implementation_t *imp,
                  pixman_op_t              op,
                  uint32_t *               dest,
                  const uint32_t *         src,
                  const uint32_t *         mask,
                  int                      width)
{
    if (mask)
	vmx_combine_in_u_mask (dest, src, mask, width);
    else
	vmx_combine_in_u_no_mask (dest, src, width);
}

static void
vmx_combine_in_reverse_u_no_mask (uint32_t *      dest,
                                  const uint32_t *src,
                                  int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_multiply (vdest, splat_alpha (vsrc));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t d = dest[i];
	uint32_t a = ALPHA_8 (src[i]);

	UN8x4_MUL_UN8 (d, a);

	dest[i] = d;
    }
}

static void
vmx_combine_in_reverse_u_mask (uint32_t *      dest,
                               const uint32_t *src,
                               const uint32_t *mask,
                               int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_multiply (vdest, splat_alpha (vsrc));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t d = dest[i];
	uint32_t a = src[i];

	UN8x4_MUL_UN8 (a, m);
	a = ALPHA_8 (a);
	UN8x4_MUL_UN8 (d, a);

	dest[i] = d;
    }
}

static void
vmx_combine_in_reverse_u (pixman_implementation_t *imp,
                          pixman_op_t              op,
                          uint32_t *               dest,
                          const uint32_t *         src,
                          const uint32_t *         mask,
                          int                      width)
{
    if (mask)
	vmx_combine_in_reverse_u_mask (dest, src, mask, width);
    else
	vmx_combine_in_reverse_u_no_mask (dest, src, width);
}

static void
vmx_combine_out_u_no_mask (uint32_t *      dest,
                           const uint32_t *src,
                           int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t a = ALPHA_8 (~dest[i]);

	UN8x4_MUL_UN8 (s, a);

	dest[i] = s;
    }
}

static void
vmx_combine_out_u_mask (uint32_t *      dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t a = ALPHA_8 (~dest[i]);

	UN8x4_MUL_UN8 (s, m);
	UN8x4_MUL_UN8 (s, a);

	dest[i] = s;
    }
}

static void
vmx_combine_out_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    if (mask)
	vmx_combine_out_u_mask (dest, src, mask, width);
    else
	vmx_combine_out_u_no_mask (dest, src, width);
}

static void
vmx_combine_out_reverse_u_no_mask (uint32_t *      dest,
                                   const uint32_t *src,
                                   int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {

	LOAD_VECTORS (dest, src);

	vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t d = dest[i];
	uint32_t a = ALPHA_8 (~src[i]);

	UN8x4_MUL_UN8 (d, a);

	dest[i] = d;
    }
}

static void
vmx_combine_out_reverse_u_mask (uint32_t *      dest,
                                const uint32_t *src,
                                const uint32_t *mask,
                                int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t d = dest[i];
	uint32_t a = src[i];

	UN8x4_MUL_UN8 (a, m);
	a = ALPHA_8 (~a);
	UN8x4_MUL_UN8 (d, a);

	dest[i] = d;
    }
}

static void
vmx_combine_out_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    if (mask)
	vmx_combine_out_reverse_u_mask (dest, src, mask, width);
    else
	vmx_combine_out_reverse_u_no_mask (dest, src, width);
}

static void
vmx_combine_atop_u_no_mask (uint32_t *      dest,
                            const uint32_t *src,
                            int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

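	/* ATOP: dest = src * dest.alpha + dest * (1 - src.alpha) */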
	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
			     vdest, splat_alpha (negate (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t dest_a = ALPHA_8 (d);
	uint32_t src_ia = ALPHA_8 (~s);

	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);

	dest[i] = s;
    }
}

static void
vmx_combine_atop_u_mask (uint32_t *      dest,
                         const uint32_t *src,
                         const uint32_t *mask,
                         int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
			     vdest, splat_alpha (negate (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t dest_a = ALPHA_8 (d);
	uint32_t src_ia;

	UN8x4_MUL_UN8 (s, m);

	src_ia = ALPHA_8 (~s);

	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);

	dest[i] = s;
    }
}

static void
vmx_combine_atop_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    if (mask)
	vmx_combine_atop_u_mask (dest, src, mask, width);
    else
	vmx_combine_atop_u_no_mask (dest, src, width);
}

static void
vmx_combine_atop_reverse_u_no_mask (uint32_t *      dest,
                                    const uint32_t *src,
                                    int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_add_mul (vdest, splat_alpha (vsrc),
			     vsrc, splat_alpha (negate (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t src_a = ALPHA_8 (s);
	uint32_t dest_ia = ALPHA_8 (~d);

	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);

	dest[i] = s;
    }
}

static void
vmx_combine_atop_reverse_u_mask (uint32_t *      dest,
                                 const uint32_t *src,
                                 const uint32_t *mask,
                                 int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_add_mul (vdest, splat_alpha (vsrc),
			     vsrc, splat_alpha (negate (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t src_a;
	uint32_t dest_ia = ALPHA_8 (~d);

	UN8x4_MUL_UN8 (s, m);

	src_a = ALPHA_8 (s);

	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);

	dest[i] = s;
    }
}

static void
vmx_combine_atop_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    if (mask)
	vmx_combine_atop_reverse_u_mask (dest, src, mask, width);
    else
	vmx_combine_atop_reverse_u_no_mask (dest, src, width);
}

static void
vmx_combine_xor_u_no_mask (uint32_t *      dest,
                           const uint32_t *src,
                           int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

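	/* XOR: dest = src * (1 - dest.alpha) + dest * (1 - src.alpha) */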
	vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
			     vdest, splat_alpha (negate (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t src_ia = ALPHA_8 (~s);
	uint32_t dest_ia = ALPHA_8 (~d);

	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);

	dest[i] = s;
    }
}

static void
vmx_combine_xor_u_mask (uint32_t *      dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
			     vdest, splat_alpha (negate (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t src_ia;
	uint32_t dest_ia = ALPHA_8 (~d);

	UN8x4_MUL_UN8 (s, m);

	src_ia = ALPHA_8 (~s);

	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);

	dest[i] = s;
    }
}

static void
vmx_combine_xor_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    if (mask)
	vmx_combine_xor_u_mask (dest, src, mask, width);
    else
	vmx_combine_xor_u_no_mask (dest, src, width);
}

static void
vmx_combine_add_u_no_mask (uint32_t *      dest,
                           const uint32_t *src,
                           int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);
    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_add (vsrc, vdest);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t d = dest[i];

	UN8x4_ADD_UN8x4 (d, s);

	dest[i] = d;
    }
}

static void
vmx_combine_add_u_mask (uint32_t *      dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_add (vsrc, vdest);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t d = dest[i];

	UN8x4_MUL_UN8 (s, m);
	UN8x4_ADD_UN8x4 (d, s);

	dest[i] = d;
    }
}

static void
vmx_combine_add_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    if (mask)
	vmx_combine_add_u_mask (dest, src, mask, width);
    else
	vmx_combine_add_u_no_mask (dest, src, width);
}

static void
vmx_combine_src_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_multiply (vsrc, vmask);

	STORE_VECTOR (dest);

	mask += 4;
	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];

	UN8x4_MUL_UN8x4 (s, a);

	dest[i] = s;
    }
}

static void
vmx_combine_over_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest);

	STORE_VECTOR (dest);

	mask += 4;
	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t sa = ALPHA_8 (s);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8 (a, sa);
	UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);

	dest[i] = d;
    }
}

static void
vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask));

	STORE_VECTOR (dest);

	mask += 4;
	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t ida = ALPHA_8 (~d);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);

	dest[i] = s;
    }
}

static void
vmx_combine_in_ca (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t da = ALPHA_8 (dest[i]);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8 (s, da);

	dest[i] = s;
    }
}

static void
vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {

	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t d = dest[i];
	uint32_t sa = ALPHA_8 (src[i]);

	UN8x4_MUL_UN8 (a, sa);
	UN8x4_MUL_UN8x4 (d, a);

	dest[i] = d;
    }
}

static void
vmx_combine_out_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_multiply (
	    pix_multiply (vsrc, vmask), splat_alpha (negate (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t da = ALPHA_8 (~d);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8 (s, da);

	dest[i] = s;
    }
}

static void
vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_multiply (
	    vdest, negate (pix_multiply (vmask, splat_alpha (vsrc))));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t sa = ALPHA_8 (s);

	UN8x4_MUL_UN8 (a, sa);
	UN8x4_MUL_UN8x4 (d, ~a);

	dest[i] = d;
    }
}

static void
vmx_combine_atop_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask, vsrca;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

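	/* Component-alpha ATOP:
	 * dest = src * mask * dest.alpha + dest * (1 - mask * src.alpha) */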
	vsrca = splat_alpha (vsrc);

	vsrc = pix_multiply (vsrc, vmask);
	vmask = pix_multiply (vmask, vsrca);

	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
			     negate (vmask), vdest);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t sa = ALPHA_8 (s);
	uint32_t da = ALPHA_8 (d);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8 (a, sa);
	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);

	dest[i] = d;
    }
}

static void
vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_add_mul (vdest,
			     pix_multiply (vmask, splat_alpha (vsrc)),
			     pix_multiply (vsrc, vmask),
			     negate (splat_alpha (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t sa = ALPHA_8 (s);
	uint32_t da = ALPHA_8 (~d);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8 (a, sa);
	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);

	dest[i] = d;
    }
}

static void
vmx_combine_xor_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_add_mul (vdest,
			     negate (pix_multiply (vmask, splat_alpha (vsrc))),
			     pix_multiply (vsrc, vmask),
			     negate (splat_alpha (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t sa = ALPHA_8 (s);
	uint32_t da = ALPHA_8 (~d);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8 (a, sa);
	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);

	dest[i] = d;
    }
}

static void
vmx_combine_add_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_add (pix_multiply (vsrc, vmask), vdest);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_ADD_UN8x4 (s, d);

	dest[i] = s;
    }
}

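/* No VMX-specific fast paths are registered; the array holds only the
 * terminating PIXMAN_OP_NONE entry. */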
static const pixman_fast_path_t vmx_fast_paths[] =
{
    {   PIXMAN_OP_NONE	},
};

pixman_implementation_t *
_pixman_implementation_create_vmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths);

    /* Set up function pointers */

    imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = vmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = vmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = vmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = vmx_combine_xor_u;

    imp->combine_32[PIXMAN_OP_ADD] = vmx_combine_add_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = vmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = vmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = vmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = vmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = vmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;

    return imp;
}