Home | History | Annotate | Download | only in video
      1 /*
      2     SDL - Simple DirectMedia Layer
      3     Copyright (C) 1997-2012 Sam Lantinga
      4 
      5     This library is free software; you can redistribute it and/or
      6     modify it under the terms of the GNU Lesser General Public
      7     License as published by the Free Software Foundation; either
      8     version 2.1 of the License, or (at your option) any later version.
      9 
     10     This library is distributed in the hope that it will be useful,
     11     but WITHOUT ANY WARRANTY; without even the implied warranty of
     12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13     Lesser General Public License for more details.
     14 
     15     You should have received a copy of the GNU Lesser General Public
     16     License along with this library; if not, write to the Free Software
     17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
     18 
     19     Sam Lantinga
     20     slouken (at) libsdl.org
     21 */
     22 #include "SDL_config.h"
     23 
     24 #include "SDL_video.h"
     25 #include "SDL_blit.h"
     26 
/*
  In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
   Checking whether _mm_free is #defined in malloc.h is the only way to
   determine if the Processor Pack is installed, as far as I can tell.
*/
     32 
     33 #if SDL_ASSEMBLY_ROUTINES
     34 #  if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
     35      /* forced MMX to 0...it breaks on most compilers now.  --ryan. */
     36 #    define MMX_ASMBLIT 0
     37 #    define GCC_ASMBLIT 0
     38 #  elif defined(_MSC_VER) && defined(_M_IX86)
     39 #    if (_MSC_VER <= 1200)
     40 #      include <malloc.h>
     41 #      if defined(_mm_free)
     42 #          define HAVE_MMINTRIN_H 1
     43 #      endif
     44 #    else  /* Visual Studio > VC6 always has mmintrin.h */
     45 #      define HAVE_MMINTRIN_H 1
     46 #    endif
     47 #    if HAVE_MMINTRIN_H
     48 #      define MMX_ASMBLIT 1
     49 #      define MSVC_ASMBLIT 1
     50 #    endif
     51 #  endif
     52 #endif /* SDL_ASSEMBLY_ROUTINES */
     53 
     54 /* Function to check the CPU flags */
     55 #include "SDL_cpuinfo.h"
     56 #if GCC_ASMBLIT
     57 #include "mmx.h"
     58 #elif MSVC_ASMBLIT
     59 #include <mmintrin.h>
     60 #include <mm3dnow.h>
     61 #endif
     62 
     63 /* Functions to perform alpha blended blitting */
     64 
     65 /* N->1 blending with per-surface alpha */
     66 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
     67 {
     68 	int width = info->d_width;
     69 	int height = info->d_height;
     70 	Uint8 *src = info->s_pixels;
     71 	int srcskip = info->s_skip;
     72 	Uint8 *dst = info->d_pixels;
     73 	int dstskip = info->d_skip;
     74 	Uint8 *palmap = info->table;
     75 	SDL_PixelFormat *srcfmt = info->src;
     76 	SDL_PixelFormat *dstfmt = info->dst;
     77 	int srcbpp = srcfmt->BytesPerPixel;
     78 
     79 	const unsigned A = srcfmt->alpha;
     80 
     81 	while ( height-- ) {
     82 	    DUFFS_LOOP4(
     83 	    {
     84 		Uint32 Pixel;
     85 		unsigned sR;
     86 		unsigned sG;
     87 		unsigned sB;
     88 		unsigned dR;
     89 		unsigned dG;
     90 		unsigned dB;
     91 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
     92 		dR = dstfmt->palette->colors[*dst].r;
     93 		dG = dstfmt->palette->colors[*dst].g;
     94 		dB = dstfmt->palette->colors[*dst].b;
     95 		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
     96 		dR &= 0xff;
     97 		dG &= 0xff;
     98 		dB &= 0xff;
     99 		/* Pack RGB into 8bit pixel */
    100 		if ( palmap == NULL ) {
    101 		    *dst =((dR>>5)<<(3+2))|
    102 			  ((dG>>5)<<(2))|
    103 			  ((dB>>6)<<(0));
    104 		} else {
    105 		    *dst = palmap[((dR>>5)<<(3+2))|
    106 				  ((dG>>5)<<(2))  |
    107 				  ((dB>>6)<<(0))];
    108 		}
    109 		dst++;
    110 		src += srcbpp;
    111 	    },
    112 	    width);
    113 	    src += srcskip;
    114 	    dst += dstskip;
    115 	}
    116 }
    117 
    118 /* N->1 blending with pixel alpha */
    119 static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
    120 {
    121 	int width = info->d_width;
    122 	int height = info->d_height;
    123 	Uint8 *src = info->s_pixels;
    124 	int srcskip = info->s_skip;
    125 	Uint8 *dst = info->d_pixels;
    126 	int dstskip = info->d_skip;
    127 	Uint8 *palmap = info->table;
    128 	SDL_PixelFormat *srcfmt = info->src;
    129 	SDL_PixelFormat *dstfmt = info->dst;
    130 	int srcbpp = srcfmt->BytesPerPixel;
    131 
    132 	/* FIXME: fix alpha bit field expansion here too? */
    133 	while ( height-- ) {
    134 	    DUFFS_LOOP4(
    135 	    {
    136 		Uint32 Pixel;
    137 		unsigned sR;
    138 		unsigned sG;
    139 		unsigned sB;
    140 		unsigned sA;
    141 		unsigned dR;
    142 		unsigned dG;
    143 		unsigned dB;
    144 		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
    145 		dR = dstfmt->palette->colors[*dst].r;
    146 		dG = dstfmt->palette->colors[*dst].g;
    147 		dB = dstfmt->palette->colors[*dst].b;
    148 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
    149 		dR &= 0xff;
    150 		dG &= 0xff;
    151 		dB &= 0xff;
    152 		/* Pack RGB into 8bit pixel */
    153 		if ( palmap == NULL ) {
    154 		    *dst =((dR>>5)<<(3+2))|
    155 			  ((dG>>5)<<(2))|
    156 			  ((dB>>6)<<(0));
    157 		} else {
    158 		    *dst = palmap[((dR>>5)<<(3+2))|
    159 				  ((dG>>5)<<(2))  |
    160 				  ((dB>>6)<<(0))  ];
    161 		}
    162 		dst++;
    163 		src += srcbpp;
    164 	    },
    165 	    width);
    166 	    src += srcskip;
    167 	    dst += dstskip;
    168 	}
    169 }
    170 
    171 /* colorkeyed N->1 blending with per-surface alpha */
    172 static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
    173 {
    174 	int width = info->d_width;
    175 	int height = info->d_height;
    176 	Uint8 *src = info->s_pixels;
    177 	int srcskip = info->s_skip;
    178 	Uint8 *dst = info->d_pixels;
    179 	int dstskip = info->d_skip;
    180 	Uint8 *palmap = info->table;
    181 	SDL_PixelFormat *srcfmt = info->src;
    182 	SDL_PixelFormat *dstfmt = info->dst;
    183 	int srcbpp = srcfmt->BytesPerPixel;
    184 	Uint32 ckey = srcfmt->colorkey;
    185 
    186 	const int A = srcfmt->alpha;
    187 
    188 	while ( height-- ) {
    189 	    DUFFS_LOOP(
    190 	    {
    191 		Uint32 Pixel;
    192 		unsigned sR;
    193 		unsigned sG;
    194 		unsigned sB;
    195 		unsigned dR;
    196 		unsigned dG;
    197 		unsigned dB;
    198 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    199 		if ( Pixel != ckey ) {
    200 		    dR = dstfmt->palette->colors[*dst].r;
    201 		    dG = dstfmt->palette->colors[*dst].g;
    202 		    dB = dstfmt->palette->colors[*dst].b;
    203 		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
    204 		    dR &= 0xff;
    205 		    dG &= 0xff;
    206 		    dB &= 0xff;
    207 		    /* Pack RGB into 8bit pixel */
    208 		    if ( palmap == NULL ) {
    209 			*dst =((dR>>5)<<(3+2))|
    210 			      ((dG>>5)<<(2)) |
    211 			      ((dB>>6)<<(0));
    212 		    } else {
    213 			*dst = palmap[((dR>>5)<<(3+2))|
    214 				      ((dG>>5)<<(2))  |
    215 				      ((dB>>6)<<(0))  ];
    216 		    }
    217 		}
    218 		dst++;
    219 		src += srcbpp;
    220 	    },
    221 	    width);
    222 	    src += srcskip;
    223 	    dst += dstskip;
    224 	}
    225 }
    226 
    227 #if GCC_ASMBLIT
    228 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
    229 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
    230 {
    231 	int width = info->d_width;
    232 	int height = info->d_height;
    233 	Uint32 *srcp = (Uint32 *)info->s_pixels;
    234 	int srcskip = info->s_skip >> 2;
    235 	Uint32 *dstp = (Uint32 *)info->d_pixels;
    236 	int dstskip = info->d_skip >> 2;
    237 	Uint32 dalpha = info->dst->Amask;
    238 	Uint64 load;
    239 
    240 	load = 0x00fefefe00fefefeULL;/* alpha128 mask */
    241 	movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
    242 	load = 0x0001010100010101ULL;/* !alpha128 mask */
    243 	movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
    244 	movd_m2r(dalpha, mm7); /* dst alpha mask */
    245 	punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
    246 	while(height--) {
    247 		DUFFS_LOOP_DOUBLE2(
    248 		{
    249 			Uint32 s = *srcp++;
    250 			Uint32 d = *dstp;
    251 			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
    252 				   + (s & d & 0x00010101)) | dalpha;
    253 		},{
    254 			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
    255 			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
    256 
    257 			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
    258 			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
    259 
    260 			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
    261 			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
    262 			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
    263 			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
    264 			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
    265 			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
    266 			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
    267 
    268 			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
    269 			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
    270 			dstp += 2;
    271 			srcp += 2;
    272 		}, width);
    273 		srcp += srcskip;
    274 		dstp += dstskip;
    275 	}
    276 	emms();
    277 }
    278 
    279 /* fast RGB888->(A)RGB888 blending with surface alpha */
    280 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
    281 {
    282 	SDL_PixelFormat* df = info->dst;
    283 	unsigned alpha = info->src->alpha;
    284 
    285 	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
    286 			/* only call a128 version when R,G,B occupy lower bits */
    287 		BlitRGBtoRGBSurfaceAlpha128MMX(info);
    288 	} else {
    289 		int width = info->d_width;
    290 		int height = info->d_height;
    291 		Uint32 *srcp = (Uint32 *)info->s_pixels;
    292 		int srcskip = info->s_skip >> 2;
    293 		Uint32 *dstp = (Uint32 *)info->d_pixels;
    294 		int dstskip = info->d_skip >> 2;
    295 
    296 		pxor_r2r(mm5, mm5); /* 0 -> mm5 */
    297 		/* form the alpha mult */
    298 		movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
    299 		punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
    300 		punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
    301 		alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
    302 		movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
    303 		punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
    304 		pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
    305 			/* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
    306 		movd_m2r(df->Amask, mm7); /* dst alpha mask */
    307 		punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
    308 
    309 		while(height--) {
    310 			DUFFS_LOOP_DOUBLE2({
    311 				/* One Pixel Blend */
    312 				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
    313 				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
    314 				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
    315 				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
    316 
    317 				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
    318 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
    319 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
    320 				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
    321 
    322 				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
    323 				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
    324 				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
    325 				++srcp;
    326 				++dstp;
    327 			},{
    328 				/* Two Pixels Blend */
    329 				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
    330 				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
    331 				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
    332 				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
    333 
    334 				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
    335 				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
    336 				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
    337 				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
    338 
    339 				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
    340 				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
    341 				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
    342 				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
    343 
    344 				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
    345 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
    346 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
    347 				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
    348 
    349 				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
    350 				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
    351 
    352 				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
    353 
    354   				srcp += 2;
    355   				dstp += 2;
    356   			}, width);
    357 			srcp += srcskip;
    358 			dstp += dstskip;
    359 		}
    360 		emms();
    361 	}
    362 }
    363 
/*
 * Fast ARGB888->(A)RGB888 blending with per-pixel alpha
 * (GCC inline-assembly MMX version).
 *
 * Register use, set up once before the loops:
 *   mm6 = 0 (used for byte<->word unpack/pack)
 *   mm7 = multiplication mask: all ones in the colour word lanes, zero in
 *         the lane that corresponds to the source alpha byte
 *   mm0 = 32-bit colour channel mask, mm3 = its complement
 *   mm5 = source alpha shift
 *
 * Per pixel: alpha == 0 leaves the destination alone, fully opaque alpha
 * copies the source colour bits while keeping the destination's remaining
 * bits, and anything else is blended as dst + (((src - dst) * alpha) >> 8)
 * on the colour lanes only.
 */
static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	SDL_PixelFormat* sf = info->src;
	Uint32 amask = sf->Amask;

	pxor_r2r(mm6, mm6); /* 0 -> mm6 */
	/* form multiplication mask */
	movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
	punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
	pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
	movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
	pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
	/* form channel masks */
	movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
	packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
	packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
	pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
	/* get alpha channel shift */
	__asm__ __volatile__ (
		"movd %0, %%mm5"
		: : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */

	while(height--) {
	    DUFFS_LOOP4({
		/* alpha bits of the source pixel, still in their original position */
		Uint32 alpha = *srcp & amask;
		/* FIXME: Here we special-case opaque alpha since the
			compositing used (>>8 instead of /255) doesn't handle
			it correctly. Also special-case alpha=0 for speed?
			Benchmark this! */
		if(alpha == 0) {
			/* do nothing */
		} else if(alpha == amask) {
			/* opaque alpha -- copy RGB, keep dst alpha */
			/* using MMX here to free up regular registers for other things */
			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
			por_r2r(mm1, mm2); /* src | dst -> mm2 */
			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
		} else {
			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */

			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */

			/* broadcast the pixel's alpha into every 16-bit lane */
			__asm__ __volatile__ (
				"movd %0, %%mm4"
				: : "r" (alpha) ); /* 0000A000 -> mm4 */
			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */

			/* blend */
			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */

			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
			movd_r2m(mm2, *dstp);/* mm2 -> dst */
		}
		++srcp;
		++dstp;
	    }, width);
	    srcp += srcskip;
	    dstp += dstskip;
	}
	/* leave MMX state so that later floating-point code works */
	emms();
}
    443 /* End GCC_ASMBLIT */
    444 
    445 #elif MSVC_ASMBLIT
    446 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
    447 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
    448 {
    449 	int width = info->d_width;
    450 	int height = info->d_height;
    451 	Uint32 *srcp = (Uint32 *)info->s_pixels;
    452 	int srcskip = info->s_skip >> 2;
    453 	Uint32 *dstp = (Uint32 *)info->d_pixels;
    454 	int dstskip = info->d_skip >> 2;
    455 	Uint32 dalpha = info->dst->Amask;
    456 
    457 	__m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
    458 
    459 	hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
    460 	lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
    461 	dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
    462 
    463 	while (height--) {
    464 		int n = width;
    465 		if ( n & 1 ) {
    466 			Uint32 s = *srcp++;
    467 			Uint32 d = *dstp;
    468 			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
    469 				   + (s & d & 0x00010101)) | dalpha;
    470 			n--;
    471 		}
    472 
    473 		for (n >>= 1; n > 0; --n) {
    474 			dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
    475 			dst2 = dst1;   /* 2 x dst -> dst2(ARGBARGB) */
    476 
    477 			src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
    478 			src2 = src1; /* 2 x src -> src2(ARGBARGB) */
    479 
    480 			dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
    481 			src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
    482 			src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
    483 			src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
    484 
    485 			dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
    486 			dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
    487 			dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
    488 			dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
    489 
    490 			*(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
    491 			dstp += 2;
    492 			srcp += 2;
    493 		}
    494 
    495 		srcp += srcskip;
    496 		dstp += dstskip;
    497 	}
    498 	_mm_empty();
    499 }
    500 
    501 /* fast RGB888->(A)RGB888 blending with surface alpha */
    502 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
    503 {
    504 	SDL_PixelFormat* df = info->dst;
    505 	Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
    506 	unsigned alpha = info->src->alpha;
    507 
    508 	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
    509 			/* only call a128 version when R,G,B occupy lower bits */
    510 		BlitRGBtoRGBSurfaceAlpha128MMX(info);
    511 	} else {
    512 		int width = info->d_width;
    513 		int height = info->d_height;
    514 		Uint32 *srcp = (Uint32 *)info->s_pixels;
    515 		int srcskip = info->s_skip >> 2;
    516 		Uint32 *dstp = (Uint32 *)info->d_pixels;
    517 		int dstskip = info->d_skip >> 2;
    518 		Uint32 dalpha = df->Amask;
    519 		Uint32 amult;
    520 
    521 		__m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
    522 
    523 		mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
    524 		/* form the alpha mult */
    525 		amult = alpha | (alpha << 8);
    526 		amult = amult | (amult << 16);
    527 		chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
    528 		mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
    529 		mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
    530 			/* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
    531 		dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
    532 
    533 		while (height--) {
    534 			int n = width;
    535 			if (n & 1) {
    536 				/* One Pixel Blend */
    537 				src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
    538 				src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
    539 
    540 				dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
    541 				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
    542 
    543 				src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
    544 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
    545 				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
    546 				dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
    547 
    548 				dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
    549 				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
    550 				*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
    551 
    552 				++srcp;
    553 				++dstp;
    554 
    555 				n--;
    556 			}
    557 
    558 			for (n >>= 1; n > 0; --n) {
    559 				/* Two Pixels Blend */
    560 				src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
    561 				src2 = src1; /* 2 x src -> src2(ARGBARGB) */
    562 				src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
    563 				src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
    564 
    565 				dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
    566 				dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
    567 				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
    568 				dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
    569 
    570 				src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
    571 				src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
    572 				src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
    573 				dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
    574 
    575 				src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
    576 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
    577 				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
    578 				dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
    579 
    580 				dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
    581 				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
    582 
    583 				*(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
    584 
    585 				srcp += 2;
    586 				dstp += 2;
    587 			}
    588 			srcp += srcskip;
    589 			dstp += dstskip;
    590 		}
    591 		_mm_empty();
    592 	}
    593 }
    594 
    595 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
    596 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
    597 {
    598 	int width = info->d_width;
    599 	int height = info->d_height;
    600 	Uint32 *srcp = (Uint32 *)info->s_pixels;
    601 	int srcskip = info->s_skip >> 2;
    602 	Uint32 *dstp = (Uint32 *)info->d_pixels;
    603 	int dstskip = info->d_skip >> 2;
    604 	SDL_PixelFormat* sf = info->src;
    605 	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
    606 	Uint32 amask = sf->Amask;
    607 	Uint32 ashift = sf->Ashift;
    608 	Uint64 multmask;
    609 
    610 	__m64 src1, dst1, mm_alpha, mm_zero, dmask;
    611 
    612 	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
    613 	multmask = ~(0xFFFFi64 << (ashift * 2));
    614 	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
    615 
    616 	while(height--) {
    617 		DUFFS_LOOP4({
    618 		Uint32 alpha = *srcp & amask;
    619 		if (alpha == 0) {
    620 			/* do nothing */
    621 		} else if (alpha == amask) {
    622 			/* opaque alpha -- copy RGB, keep dst alpha */
    623 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
    624 		} else {
    625 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
    626 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
    627 
    628 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
    629 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
    630 
    631 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
    632 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
    633 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
    634 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
    635 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
    636 
    637 			/* blend */
    638 			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
    639 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
    640 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
    641 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
    642 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
    643 
    644 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
    645 		}
    646 		++srcp;
    647 		++dstp;
    648 	    }, width);
    649 	    srcp += srcskip;
    650 	    dstp += dstskip;
    651 	}
    652 	_mm_empty();
    653 }
    654 /* End MSVC_ASMBLIT */
    655 
    656 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
    657 
    658 #if SDL_ALTIVEC_BLITTERS
    659 #if __MWERKS__
    660 #pragma altivec_model on
    661 #endif
    662 #if HAVE_ALTIVEC_H
    663 #include <altivec.h>
    664 #endif
    665 #include <assert.h>
    666 
/*
 * Vector literal helpers.  When building for __MACOSX__ with a GCC major
 * version below 4 the literal is written with parentheses; every other
 * configuration gets the brace-initializer form.
 *   VECUINT8_LITERAL  - 16 values, cast to vector unsigned char
 *   VECUINT16_LITERAL -  8 values, cast to vector unsigned short
 */
#if (defined(__MACOSX__) && (__GNUC__ < 4))
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) ( a,b,c,d,e,f,g,h )
#else
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) { a,b,c,d,e,f,g,h }
#endif
    678 
    679 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
/* Debug helper: prints msg followed by the four 32-bit words of vector v in
   hexadecimal.  v is copied into a local first so that its address can be
   taken.  NOTE(review): uses printf, and no <stdio.h> include is visible in
   this part of the file - confirm a declaration is in scope before use. */
#define VECPRINT(msg, v) do { \
    vector unsigned int tmpvec = (vector unsigned int)(v); \
    unsigned int *vp = (unsigned int *)&tmpvec; \
    printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
} while (0)
    685 
/* the permutation vector that takes the high bytes out of all the appropriate shorts
    (vector unsigned char)(
        0x00, 0x10, 0x02, 0x12,
        0x04, 0x14, 0x06, 0x16,
        0x08, 0x18, 0x0A, 0x1A,
        0x0C, 0x1C, 0x0E, 0x1E );
*/
/* Permute vector built at run time: vec_lvsl(0, NULL) plus a vector whose
   16-bit elements are all 0x000F, reinterpreted as bytes. */
#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
/* 12 + 12 in every 32-bit element, i.e. a splat of 24 */
#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
/* All-ones words shifted left by 24: 0xFF000000 in every 32-bit element */
#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
/* Permute vector for reading from src: vec_lvsl(0, src) when src is not
   16-byte aligned, otherwise vec_lvsl(8, src) with 8 added to every byte.
   NOTE(review): presumably consumed by vec_perm over two adjacent loads -
   confirm at the call sites. */
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    ? vec_lvsl(0, src) \
    : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
    699 
    700 
/*
 * Blend vs into vd, overwriting vd.
 *
 * Each byte of vs is multiplied by the matching byte of valpha, and each
 * byte of vd by the complement of that byte (vec_nor of valpha with itself,
 * i.e. 255 - alpha).  The even and odd byte products are formed separately
 * as 16-bit values and the source and destination products are added.
 * Each sum x is then turned into (x + 1) + ((x + 1) >> 8), and mergePermute
 * selects the bytes of the two result vectors that are stored into vd.
 *
 *   vs, vd        - byte vectors; vd must be an lvalue
 *   valpha        - per-byte alpha multipliers
 *   mergePermute  - permute vector passed to vec_perm for the final merge
 *   v1_16, v8_16  - 16-bit vectors used as the "+1" addend and the shift
 *                   count; assumed to be splats of 1 and 8 - confirm at the
 *                   call sites
 */
#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
    /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
    vector unsigned short vtemp1 = vec_mule(vs, valpha); \
    /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
    vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
    /* valpha2 is 255-alpha */ \
    vector unsigned char valpha2 = vec_nor(valpha, valpha); \
    /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
    vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
    /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
    vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
    /* add source and dest */ \
    vtemp1 = vec_add(vtemp1, vtemp3); \
    vtemp2 = vec_add(vtemp2, vtemp4); \
    /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
    vtemp1 = vec_add(vtemp1, v1_16); \
    vtemp3 = vec_sr(vtemp1, v8_16); \
    vtemp1 = vec_add(vtemp1, vtemp3); \
    /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
    vtemp2 = vec_add(vtemp2, v1_16); \
    vtemp4 = vec_sr(vtemp2, v8_16); \
    vtemp2 = vec_add(vtemp2, vtemp4); \
    /* (>>8) and get ARGBARGBARGBARGB */ \
    vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
} while (0)
    726 
    727 /* Calculate the permute vector used for 32->32 swizzling */
    728 static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
    729                                   const SDL_PixelFormat *dstfmt)
    730 {
    731     /*
    732      * We have to assume that the bits that aren't used by other
    733      *  colors is alpha, and it's one complete byte, since some formats
    734      *  leave alpha with a zero mask, but we should still swizzle the bits.
    735      */
    736     /* ARGB */
    737     const static struct SDL_PixelFormat default_pixel_format = {
    738         NULL, 0, 0,
    739         0, 0, 0, 0,
    740         16, 8, 0, 24,
    741         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
    742         0, 0};
    743     if (!srcfmt) {
    744         srcfmt = &default_pixel_format;
    745     }
    746     if (!dstfmt) {
    747         dstfmt = &default_pixel_format;
    748     }
    749     const vector unsigned char plus = VECUINT8_LITERAL
    750                                             ( 0x00, 0x00, 0x00, 0x00,
    751                                               0x04, 0x04, 0x04, 0x04,
    752                                               0x08, 0x08, 0x08, 0x08,
    753                                               0x0C, 0x0C, 0x0C, 0x0C );
    754     vector unsigned char vswiz;
    755     vector unsigned int srcvec;
    756 #define RESHIFT(X) (3 - ((X) >> 3))
    757     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
    758     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
    759     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
    760     Uint32 amask;
    761     /* Use zero for alpha if either surface doesn't have alpha */
    762     if (dstfmt->Amask) {
    763         amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
    764     } else {
    765         amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
    766     }
    767 #undef RESHIFT
    768     ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
    769     vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
    770     return(vswiz);
    771 }
    772 
    773 static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
    774 {
            /*
             * Alpha-blends a 4-byte-per-pixel source carrying per-pixel alpha
             * onto a 2-byte-per-pixel destination that is unpacked and repacked
             * as 5-6-5 RGB.
             *
             * info supplies the pixel pointers (s_pixels / d_pixels), the blit
             * size (d_width / d_height), the per-row skips (s_skip / d_skip,
             * added to the byte pointers after every row) and the source pixel
             * format (src).
             *
             * Every row is handled in three phases:
             *   1. a scalar loop that runs while UNALIGNED_PTR(dst) is true,
             *   2. a vector loop that consumes 8 pixels per iteration,
             *   3. a scalar loop for the remaining (width % 8) pixels.
             * UNALIGNED_PTR, VEC_ALIGNER, VEC_MERGE_PERMUTE and
             * VEC_MULTIPLY_ALPHA are macros defined elsewhere in this file.
             */
    775     int height = info->d_height;
    776     Uint8 *src = (Uint8 *)info->s_pixels;
    777     int srcskip = info->s_skip;
    778     Uint8 *dst = (Uint8 *)info->d_pixels;
    779     int dstskip = info->d_skip;
    780     SDL_PixelFormat *srcfmt = info->src;
    781 
            /* Small splatted constants used as shift counts and masks below. */
    782     vector unsigned char v0 = vec_splat_u8(0);
    783     vector unsigned short v8_16 = vec_splat_u16(8);
    784     vector unsigned short v1_16 = vec_splat_u16(1);
    785     vector unsigned short v2_16 = vec_splat_u16(2);
    786     vector unsigned short v3_16 = vec_splat_u16(3);
    787     vector unsigned int v8_32 = vec_splat_u32(8);
    788     vector unsigned int v16_32 = vec_add(v8_32, v8_32);
    789     vector unsigned short v3f = VECUINT16_LITERAL(
    790         0x003f, 0x003f, 0x003f, 0x003f,
    791         0x003f, 0x003f, 0x003f, 0x003f);
    792     vector unsigned short vfc = VECUINT16_LITERAL(
    793         0x00fc, 0x00fc, 0x00fc, 0x00fc,
    794         0x00fc, 0x00fc, 0x00fc, 0x00fc);
    795 
    796     /*
    797         0x10 - 0x1f is the alpha
    798         0x00 - 0x0e evens are the red
    799         0x01 - 0x0f odds are zero
    800     */
    801     vector unsigned char vredalpha1 = VECUINT8_LITERAL(
    802         0x10, 0x00, 0x01, 0x01,
    803         0x10, 0x02, 0x01, 0x01,
    804         0x10, 0x04, 0x01, 0x01,
    805         0x10, 0x06, 0x01, 0x01
    806     );
            /* Same pattern for pixels 4..7: adds 8 to the red index of every word. */
    807     vector unsigned char vredalpha2 = (vector unsigned char)(
    808         vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
    809     );
    810     /*
    811         0x00 - 0x0f is ARxx ARxx ARxx ARxx
    812         0x11 - 0x0f odds are blue
    813     */
    814     vector unsigned char vblue1 = VECUINT8_LITERAL(
    815         0x00, 0x01, 0x02, 0x11,
    816         0x04, 0x05, 0x06, 0x13,
    817         0x08, 0x09, 0x0a, 0x15,
    818         0x0c, 0x0d, 0x0e, 0x17
    819     );
            /* Same pattern for pixels 4..7: adds 8 to the blue index of every word. */
    820     vector unsigned char vblue2 = (vector unsigned char)(
    821         vec_add((vector unsigned int)vblue1, v8_32)
    822     );
    823     /*
    824         0x00 - 0x0f is ARxB ARxB ARxB ARxB
    825         0x10 - 0x0e evens are green
    826     */
    827     vector unsigned char vgreen1 = VECUINT8_LITERAL(
    828         0x00, 0x01, 0x10, 0x03,
    829         0x04, 0x05, 0x12, 0x07,
    830         0x08, 0x09, 0x14, 0x0b,
    831         0x0c, 0x0d, 0x16, 0x0f
    832     );
            /* Same pattern for pixels 4..7: adds 8 to the green index of every word. */
    833     vector unsigned char vgreen2 = (vector unsigned char)(
    834         vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
    835     );
            /* Picks byte 2 of each 32-bit pixel out of vdst1/vdst2 into the low
               byte of each 16-bit lane (used for the green channel when repacking). */
    836     vector unsigned char vgmerge = VECUINT8_LITERAL(
    837         0x00, 0x02, 0x00, 0x06,
    838         0x00, 0x0a, 0x00, 0x0e,
    839         0x00, 0x12, 0x00, 0x16,
    840         0x00, 0x1a, 0x00, 0x1e);
    841     vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
    842     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
            /* lvsl(0, NULL) yields bytes 0..15; masking with 0xC gives
               0,0,0,0,4,4,4,4,... i.e. byte 0 of each pixel replicated four times. */
    843     vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
    844 
            /*
             * 0xf800 in every 16-bit lane (the five red bits of a 565 pixel):
             * splat 0xf8 into every byte, then shift each lane left by 8.
             * The previous literal -7 (0xf9) produced 0xf900, which let one
             * green bit leak into the unpacked destination red, unlike the
             * scalar path's "& 0xf8".
             */
    845     vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-8);
    846     vf800 = vec_sl(vf800, vec_splat_u16(8));
    847 
    848     while(height--) {
    849         int extrawidth;
    850         vector unsigned char valigner;
    851         vector unsigned char vsrc;
    852         vector unsigned char voverflow;
    853         int width = info->d_width;
    854 
            /* Scalar fallback: blends one pixel at a time while `condition` holds;
               source pixels whose alpha is zero leave the destination untouched. */
    855 #define ONE_PIXEL_BLEND(condition, widthvar) \
    856         while (condition) { \
    857             Uint32 Pixel; \
    858             unsigned sR, sG, sB, dR, dG, dB, sA; \
    859             DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
    860             if(sA) { \
    861                 unsigned short dstpixel = *((unsigned short *)dst); \
    862                 dR = (dstpixel >> 8) & 0xf8; \
    863                 dG = (dstpixel >> 3) & 0xfc; \
    864                 dB = (dstpixel << 3) & 0xf8; \
    865                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
    866                 *((unsigned short *)dst) = ( \
    867                     ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
    868                 ); \
    869             } \
    870             src += 4; \
    871             dst += 2; \
    872             widthvar--; \
    873         }
    874         ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
                /* Only touch the source with vector loads when pixels remain;
                   this matches the guard used by the other AltiVec blitters here. */
                if (width > 0) {
    875             extrawidth = (width % 8);
    876             valigner = VEC_ALIGNER(src);
    877             vsrc = (vector unsigned char)vec_ld(0, src);
    878             width -= extrawidth;
    879             while (width) {
    880                 vector unsigned char valpha;
    881                 vector unsigned char vsrc1, vsrc2;
    882                 vector unsigned char vdst1, vdst2;
    883                 vector unsigned short vR, vG, vB;
    884                 vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
    885 
    886                 /* Load 8 pixels from src as ARGB */
    887                 voverflow = (vector unsigned char)vec_ld(15, src);
    888                 vsrc = vec_perm(vsrc, voverflow, valigner);
    889                 vsrc1 = vec_perm(vsrc, vsrc, vpermute);
    890                 src += 16;
    891                 vsrc = (vector unsigned char)vec_ld(15, src);
    892                 voverflow = vec_perm(voverflow, vsrc, valigner);
    893                 vsrc2 = vec_perm(voverflow, voverflow, vpermute);
    894                 src += 16;
    895 
    896                 /* Load 8 pixels from dst as XRGB */
    897                 voverflow = vec_ld(0, dst);
    898                 vR = vec_and((vector unsigned short)voverflow, vf800);
    899                 vB = vec_sl((vector unsigned short)voverflow, v3_16);
    900                 vG = vec_sl(vB, v2_16);
    901                 vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
    902                 vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
    903                 vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
    904                 vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
    905                 vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
    906                 vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
    907 
    908                 /* Alpha blend 8 pixels as ARGB */
    909                 valpha = vec_perm(vsrc1, v0, valphaPermute);
    910                 VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
    911                 valpha = vec_perm(vsrc2, v0, valphaPermute);
    912                 VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
    913 
    914                 /* Convert 8 pixels to 565 */
    915                 vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
    916                 vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
    917                 vgpixel = vec_and(vgpixel, vfc);
    918                 vgpixel = vec_sl(vgpixel, v3_16);
    919                 vrpixel = vec_sl(vpixel, v1_16);
    920                 vrpixel = vec_and(vrpixel, vf800);
    921                 vbpixel = vec_and(vpixel, v3f);
    922                 vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
    923                 vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
    924 
    925                 /* Store 8 pixels */
    926                 vec_st(vdst1, 0, dst);
    927 
    928                 width -= 8;
    929                 dst += 16;
    930             }
    931             ONE_PIXEL_BLEND((extrawidth), extrawidth);
                }
    932 #undef ONE_PIXEL_BLEND
    933         src += srcskip;
    934         dst += dstskip;
    935     }
    936 }
    937 
    938 static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
    939 {
            /*
             * Blends a 32-bit source onto a 32-bit destination using the
             * per-surface alpha (info->src->alpha), leaving the destination
             * untouched wherever the source pixel's RGB bits equal the source
             * colorkey.
             *
             * info supplies the pixel pointers, the blit size, the per-row
             * skips (shifted right by 2 because the pointers advance in Uint32
             * units) and both pixel formats.
             *
             * Every row is handled by a scalar loop while UNALIGNED_PTR(dstp) is
             * true, a vector loop that consumes 4 pixels per iteration, and a
             * scalar loop for the remaining (width % 4) pixels.  Both paths
             * compare only the bits covered by the source R/G/B masks against
             * the colorkey, so edge pixels and vectorised pixels agree.
             */
    940     unsigned alpha = info->src->alpha;
    941     int height = info->d_height;
    942     Uint32 *srcp = (Uint32 *)info->s_pixels;
    943     int srcskip = info->s_skip >> 2;
    944     Uint32 *dstp = (Uint32 *)info->d_pixels;
    945     int dstskip = info->d_skip >> 2;
    946     SDL_PixelFormat *srcfmt = info->src;
    947     SDL_PixelFormat *dstfmt = info->dst;
    948     unsigned sA = srcfmt->alpha;
            /* alpha written by the scalar path: opaque if dst has an alpha mask, else 0 */
    949     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
    950     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
    951     Uint32 ckey = info->src->colorkey;
    952     vector unsigned char mergePermute;
    953     vector unsigned char vsrcPermute;
    954     vector unsigned char vdstPermute;
    955     vector unsigned char vsdstPermute;
    956     vector unsigned char valpha;
    957     vector unsigned char valphamask;
    958     vector unsigned char vbits;
    959     vector unsigned char v0;
    960     vector unsigned short v1;
    961     vector unsigned short v8;
    962     vector unsigned int vckey;
    963     vector unsigned int vrgbmask;
    964 
    965     mergePermute = VEC_MERGE_PERMUTE();
    966     v0 = vec_splat_u8(0);
    967     v1 = vec_splat_u16(1);
    968     v8 = vec_splat_u16(8);
    969 
    970     /* set the alpha to 255 on the destination surf */
    971     valphamask = VEC_ALPHA_MASK();
    972 
            /* src -> working layout, working layout -> dst, dst -> working layout */
    973     vsrcPermute = calc_swizzle32(srcfmt, NULL);
    974     vdstPermute = calc_swizzle32(NULL, dstfmt);
    975     vsdstPermute = calc_swizzle32(dstfmt, NULL);
    976 
    977     /* splat the surface alpha into every byte of valpha; vbits is all ones */
    978     ((unsigned char *)&valpha)[0] = alpha;
    979     valpha = vec_splat(valpha, 0);
    980     vbits = (vector unsigned char)vec_splat_s8(-1);
    981 
            /* colorkey and RGB mask, reduced to RGB bits and splatted to all four lanes */
    982     ckey &= rgbmask;
    983     ((unsigned int *)(char*)&vckey)[0] = ckey;
    984     vckey = vec_splat(vckey, 0);
    985     ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
    986     vrgbmask = vec_splat(vrgbmask, 0);
    987 
    988     while(height--) {
    989         int width = info->d_width;
            /* Scalar fallback.  The pixel is masked with rgbmask before the
               colorkey test because ckey was masked above and the vector loop
               masks as well; comparing the raw pixel made edge pixels with bits
               outside the RGB mask miss the key. */
    990 #define ONE_PIXEL_BLEND(condition, widthvar) \
    991         while (condition) { \
    992             Uint32 Pixel; \
    993             unsigned sR, sG, sB, dR, dG, dB; \
    994             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
    995             if(sA && (Pixel & rgbmask) != ckey) { \
    996                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
    997                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
    998                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
    999                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
   1000             } \
   1001             dstp++; \
   1002             srcp++; \
   1003             widthvar--; \
   1004         }
   1005         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   1006         if (width > 0) {
   1007             int extrawidth = (width % 4);
   1008             vector unsigned char valigner = VEC_ALIGNER(srcp);
   1009             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
   1010             width -= extrawidth;
   1011             while (width) {
   1012                 vector unsigned char vsel;
   1013                 vector unsigned char voverflow;
   1014                 vector unsigned char vd;
   1015                 vector unsigned char vd_orig;
   1016 
   1017                 /* s = *srcp */
   1018                 voverflow = (vector unsigned char)vec_ld(15, srcp);
   1019                 vs = vec_perm(vs, voverflow, valigner);
   1020 
   1021                 /* vsel is set for items that match the key */
   1022                 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
   1023                 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
   1024 
   1025                 /* permute to source format */
   1026                 vs = vec_perm(vs, valpha, vsrcPermute);
   1027 
   1028                 /* d = *dstp */
   1029                 vd = (vector unsigned char)vec_ld(0, dstp);
   1030                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
   1031 
   1032                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   1033 
   1034                 /* set the alpha channel to full on */
   1035                 vd = vec_or(vd, valphamask);
   1036 
   1037                 /* mask out color key */
   1038                 vd = vec_sel(vd, vd_orig, vsel);
   1039 
   1040                 /* permute to dest format */
   1041                 vd = vec_perm(vd, vbits, vdstPermute);
   1042 
   1043                 /* *dstp = res */
   1044                 vec_st((vector unsigned int)vd, 0, dstp);
   1045 
   1046                 srcp += 4;
   1047                 dstp += 4;
   1048                 width -= 4;
                        /* the second load becomes the first half of the next unaligned read */
   1049                 vs = voverflow;
   1050             }
   1051             ONE_PIXEL_BLEND((extrawidth), extrawidth);
   1052         }
   1053 #undef ONE_PIXEL_BLEND
   1054 
   1055         srcp += srcskip;
   1056         dstp += dstskip;
   1057     }
   1058 }
   1059 
   1060 
   1061 static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
   1062 {
            /*
             * Blends a 32-bit source carrying per-pixel alpha onto a 32-bit
             * destination.  The destination's own alpha value is kept: the
             * scalar path re-assembles the pixel with the dA it read, and the
             * vector path saves the alpha bytes (vdstalpha) before blending and
             * ORs them back afterwards.
             *
             * info supplies the pixel pointers, the blit size, the per-row
             * skips (shifted right by 2 because the pointers advance in Uint32
             * units) and both pixel formats.
             *
             * Every row is handled by a scalar loop while UNALIGNED_PTR(dstp) is
             * true, a vector loop that consumes 4 pixels per iteration, and a
             * scalar loop for the remaining (width % 4) pixels.
             */
   1063     int width = info->d_width;
   1064     int height = info->d_height;
   1065     Uint32 *srcp = (Uint32 *)info->s_pixels;
   1066     int srcskip = info->s_skip >> 2;
   1067     Uint32 *dstp = (Uint32 *)info->d_pixels;
   1068     int dstskip = info->d_skip >> 2;
   1069     SDL_PixelFormat *srcfmt = info->src;
   1070     SDL_PixelFormat *dstfmt = info->dst;
   1071     vector unsigned char mergePermute;
   1072     vector unsigned char valphaPermute;
   1073     vector unsigned char vsrcPermute;
   1074     vector unsigned char vdstPermute;
   1075     vector unsigned char vsdstPermute;
   1076     vector unsigned char valphamask;
   1077     vector unsigned char vpixelmask;
   1078     vector unsigned char v0;
   1079     vector unsigned short v1;
   1080     vector unsigned short v8;
   1081 
   1082     v0 = vec_splat_u8(0);
   1083     v1 = vec_splat_u16(1);
   1084     v8 = vec_splat_u16(8);
   1085     mergePermute = VEC_MERGE_PERMUTE();
   1086     valphamask = VEC_ALPHA_MASK();
            /* bytes 0,0,0,0,4,4,4,4,...: replicates byte 0 of each pixel four times */
   1087     valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
            /* complement of the alpha mask: selects the non-alpha bytes */
   1088     vpixelmask = vec_nor(valphamask, v0);
            /* src -> working layout, working layout -> dst, dst -> working layout */
   1089     vsrcPermute = calc_swizzle32(srcfmt, NULL);
   1090     vdstPermute = calc_swizzle32(NULL, dstfmt);
   1091     vsdstPermute = calc_swizzle32(dstfmt, NULL);
   1092 
   1093 	while ( height-- ) {
   1094         width = info->d_width;
            /* Scalar fallback: blends one pixel at a time while `condition` holds;
               source pixels whose alpha is zero leave the destination untouched. */
   1095 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
   1096             Uint32 Pixel; \
   1097             unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
   1098             DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
   1099             if(sA) { \
   1100               DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
   1101               ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   1102               ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
   1103             } \
   1104             ++srcp; \
   1105             ++dstp; \
   1106             widthvar--; \
   1107         }
   1108         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   1109         if (width > 0) {
   1110             /* vsrcPermute */
   1111             /* vdstPermute */
   1112             int extrawidth = (width % 4);
   1113             vector unsigned char valigner = VEC_ALIGNER(srcp);
   1114             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
   1115             width -= extrawidth;
   1116             while (width) {
   1117                 vector unsigned char voverflow;
   1118                 vector unsigned char vd;
   1119                 vector unsigned char valpha;
   1120                 vector unsigned char vdstalpha;
   1121                 /* s = *srcp */
   1122                 voverflow = (vector unsigned char)vec_ld(15, srcp);
   1123                 vs = vec_perm(vs, voverflow, valigner);
   1124                 vs = vec_perm(vs, v0, vsrcPermute);
   1125 
                        /* per-pixel alpha, replicated across the four bytes of each pixel */
   1126                 valpha = vec_perm(vs, v0, valphaPermute);
   1127 
   1128                 /* d = *dstp */
   1129                 vd = (vector unsigned char)vec_ld(0, dstp);
   1130                 vd = vec_perm(vd, v0, vsdstPermute);
   1131                 vdstalpha = vec_and(vd, valphamask);
   1132 
   1133                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   1134 
   1135                 /* set the alpha to the dest alpha */
   1136                 vd = vec_and(vd, vpixelmask);
   1137                 vd = vec_or(vd, vdstalpha);
   1138                 vd = vec_perm(vd, v0, vdstPermute);
   1139 
   1140                 /* *dstp = res */
   1141                 vec_st((vector unsigned int)vd, 0, dstp);
   1142 
   1143                 srcp += 4;
   1144                 dstp += 4;
   1145                 width -= 4;
                        /* the second load becomes the first half of the next unaligned read */
   1146                 vs = voverflow;
   1147 
   1148             }
   1149             ONE_PIXEL_BLEND((extrawidth), extrawidth);
   1150         }
   1151 	    srcp += srcskip;
   1152 	    dstp += dstskip;
   1153 #undef ONE_PIXEL_BLEND
   1154 	}
   1155 }
   1156 
   1157 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
        /*
         * AltiVec variant for the fixed layout used by the scalar code below:
         * source alpha in bits 24-31, red/blue in 0x00ff00ff and green in
         * 0x0000ff00.  No format swizzling is performed.  The destination's
         * top byte is preserved by both the scalar and the vector path.
         *
         * Every row is handled by a scalar loop while UNALIGNED_PTR(dstp) is
         * true, a vector loop that consumes 4 pixels per iteration, and a
         * scalar loop for the remaining (width % 4) pixels.
         */
   1158 static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
   1159 {
   1160 	int width = info->d_width;
   1161 	int height = info->d_height;
   1162 	Uint32 *srcp = (Uint32 *)info->s_pixels;
        	/* skips are shifted right by 2 because the pointers advance in Uint32 units */
   1163 	int srcskip = info->s_skip >> 2;
   1164 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   1165 	int dstskip = info->d_skip >> 2;
   1166     vector unsigned char mergePermute;
   1167     vector unsigned char valphaPermute;
   1168     vector unsigned char valphamask;
   1169     vector unsigned char vpixelmask;
   1170     vector unsigned char v0;
   1171     vector unsigned short v1;
   1172     vector unsigned short v8;
   1173     v0 = vec_splat_u8(0);
   1174     v1 = vec_splat_u16(1);
   1175     v8 = vec_splat_u16(8);
   1176     mergePermute = VEC_MERGE_PERMUTE();
   1177     valphamask = VEC_ALPHA_MASK();
            /* bytes 0,0,0,0,4,4,4,4,...: replicates byte 0 of each pixel four times */
   1178     valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
   1179 
   1180 
            /* complement of the alpha mask: selects the non-alpha bytes */
   1181     vpixelmask = vec_nor(valphamask, v0);
   1182 	while(height--) {
   1183         width = info->d_width;
            /* Scalar fallback.  alpha == 0 leaves dst untouched; opaque alpha
               copies the source RGB; otherwise red/blue are blended together in
               one multiply (0xff00ff) and green (0xff00) in another. */
   1184 #define ONE_PIXEL_BLEND(condition, widthvar) \
   1185         while ((condition)) { \
   1186             Uint32 dalpha; \
   1187             Uint32 d; \
   1188             Uint32 s1; \
   1189             Uint32 d1; \
   1190             Uint32 s = *srcp; \
   1191             Uint32 alpha = s >> 24; \
   1192             if(alpha) { \
   1193               if(alpha == SDL_ALPHA_OPAQUE) { \
   1194                 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
   1195               } else { \
   1196                 d = *dstp; \
   1197                 dalpha = d & 0xff000000; \
   1198                 s1 = s & 0xff00ff; \
   1199                 d1 = d & 0xff00ff; \
   1200                 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
   1201                 s &= 0xff00; \
   1202                 d &= 0xff00; \
   1203                 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
   1204                 *dstp = d1 | d | dalpha; \
   1205               } \
   1206             } \
   1207             ++srcp; \
   1208             ++dstp; \
   1209             widthvar--; \
   1210 	    }
   1211         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   1212         if (width > 0) {
   1213             int extrawidth = (width % 4);
   1214             vector unsigned char valigner = VEC_ALIGNER(srcp);
   1215             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
   1216             width -= extrawidth;
   1217             while (width) {
   1218                 vector unsigned char voverflow;
   1219                 vector unsigned char vd;
   1220                 vector unsigned char valpha;
   1221                 vector unsigned char vdstalpha;
   1222                 /* s = *srcp */
   1223                 voverflow = (vector unsigned char)vec_ld(15, srcp);
   1224                 vs = vec_perm(vs, voverflow, valigner);
   1225 
                        /* per-pixel alpha, replicated across the four bytes of each pixel */
   1226                 valpha = vec_perm(vs, v0, valphaPermute);
   1227 
   1228                 /* d = *dstp */
   1229                 vd = (vector unsigned char)vec_ld(0, dstp);
   1230                 vdstalpha = vec_and(vd, valphamask);
   1231 
   1232                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   1233 
   1234                 /* set the alpha to the dest alpha */
   1235                 vd = vec_and(vd, vpixelmask);
   1236                 vd = vec_or(vd, vdstalpha);
   1237 
   1238                 /* *dstp = res */
   1239                 vec_st((vector unsigned int)vd, 0, dstp);
   1240 
   1241                 srcp += 4;
   1242                 dstp += 4;
   1243                 width -= 4;
                        /* the second load becomes the first half of the next unaligned read */
   1244                 vs = voverflow;
   1245             }
   1246             ONE_PIXEL_BLEND((extrawidth), extrawidth);
   1247         }
   1248 	    srcp += srcskip;
   1249 	    dstp += dstskip;
   1250 	}
   1251 #undef ONE_PIXEL_BLEND
   1252 }
   1253 
   1254 static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
   1255 {
            /*
             * Blends a 32-bit source onto a 32-bit destination using the
             * per-surface alpha (info->src->alpha) for every pixel.
             *
             * info supplies the pixel pointers, the blit size, the per-row
             * skips (shifted right by 2 because the pointers advance in Uint32
             * units) and both pixel formats.
             *
             * Every row is handled by a scalar loop while UNALIGNED_PTR(dstp) is
             * true, a vector loop that consumes 4 pixels per iteration, and a
             * scalar loop for the remaining (width % 4) pixels.
             */
   1256     /* XXX : 6 */
   1257 	unsigned alpha = info->src->alpha;
   1258     int height = info->d_height;
   1259     Uint32 *srcp = (Uint32 *)info->s_pixels;
   1260     int srcskip = info->s_skip >> 2;
   1261     Uint32 *dstp = (Uint32 *)info->d_pixels;
   1262     int dstskip = info->d_skip >> 2;
   1263     SDL_PixelFormat *srcfmt = info->src;
   1264     SDL_PixelFormat *dstfmt = info->dst;
        	/* sA is the same surface alpha as `alpha`; dA is the alpha the scalar
        	   path writes: opaque if dst has an alpha mask, else 0 */
   1265 	unsigned sA = srcfmt->alpha;
   1266 	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   1267     vector unsigned char mergePermute;
   1268     vector unsigned char vsrcPermute;
   1269     vector unsigned char vdstPermute;
   1270     vector unsigned char vsdstPermute;
   1271     vector unsigned char valpha;
   1272     vector unsigned char valphamask;
   1273     vector unsigned char vbits;
   1274     vector unsigned short v1;
   1275     vector unsigned short v8;
   1276 
   1277     mergePermute = VEC_MERGE_PERMUTE();
   1278     v1 = vec_splat_u16(1);
   1279     v8 = vec_splat_u16(8);
   1280 
   1281     /* set the alpha to 255 on the destination surf */
   1282     valphamask = VEC_ALPHA_MASK();
   1283 
            /* src -> working layout, working layout -> dst, dst -> working layout */
   1284     vsrcPermute = calc_swizzle32(srcfmt, NULL);
   1285     vdstPermute = calc_swizzle32(NULL, dstfmt);
   1286     vsdstPermute = calc_swizzle32(dstfmt, NULL);
   1287 
   1288     /* splat the surface alpha into every byte of valpha; vbits is all ones */
   1289     ((unsigned char *)&valpha)[0] = alpha;
   1290     valpha = vec_splat(valpha, 0);
   1291     vbits = (vector unsigned char)vec_splat_s8(-1);
   1292 
   1293     while(height--) {
   1294         int width = info->d_width;
            /* Scalar fallback: blends one pixel at a time with the surface alpha
               while `condition` holds. */
   1295 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
   1296             Uint32 Pixel; \
   1297             unsigned sR, sG, sB, dR, dG, dB; \
   1298             DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
   1299             DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
   1300             ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   1301             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
   1302             ++srcp; \
   1303             ++dstp; \
   1304             widthvar--; \
   1305         }
   1306         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   1307         if (width > 0) {
   1308             int extrawidth = (width % 4);
   1309             vector unsigned char valigner = VEC_ALIGNER(srcp);
   1310             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
   1311             width -= extrawidth;
   1312             while (width) {
   1313                 vector unsigned char voverflow;
   1314                 vector unsigned char vd;
   1315 
   1316                 /* s = *srcp */
   1317                 voverflow = (vector unsigned char)vec_ld(15, srcp);
   1318                 vs = vec_perm(vs, voverflow, valigner);
   1319                 vs = vec_perm(vs, valpha, vsrcPermute);
   1320 
   1321                 /* d = *dstp */
   1322                 vd = (vector unsigned char)vec_ld(0, dstp);
   1323                 vd = vec_perm(vd, vd, vsdstPermute);
   1324 
   1325                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   1326 
   1327                 /* set the alpha channel to full on */
   1328                 vd = vec_or(vd, valphamask);
   1329                 vd = vec_perm(vd, vbits, vdstPermute);
   1330 
   1331                 /* *dstp = res */
   1332                 vec_st((vector unsigned int)vd, 0, dstp);
   1333 
   1334                 srcp += 4;
   1335                 dstp += 4;
   1336                 width -= 4;
                        /* the second load becomes the first half of the next unaligned read */
   1337                 vs = voverflow;
   1338             }
   1339             ONE_PIXEL_BLEND((extrawidth), extrawidth);
   1340         }
   1341 #undef ONE_PIXEL_BLEND
   1342 
   1343         srcp += srcskip;
   1344         dstp += dstskip;
   1345     }
   1346 
   1347 }
   1348 
   1349 
   1350 /* fast RGB888->(A)RGB888 blending */
        /*
         * AltiVec variant that blends every pixel with the per-surface alpha
         * (info->src->alpha) for the fixed layout used by the scalar code
         * below: red/blue in 0x00ff00ff, green in 0x0000ff00.  The top byte of
         * every written pixel is forced on (0xff000000 in the scalar path,
         * valphamask in the vector path).
         *
         * Every row is handled by a scalar loop while UNALIGNED_PTR(dstp) is
         * true, a vector loop that consumes 4 pixels per iteration, and a
         * scalar loop for the remaining (width % 4) pixels.
         */
   1351 static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
   1352 {
   1353 	unsigned alpha = info->src->alpha;
   1354     int height = info->d_height;
   1355     Uint32 *srcp = (Uint32 *)info->s_pixels;
            /* skips are shifted right by 2 because the pointers advance in Uint32 units */
   1356     int srcskip = info->s_skip >> 2;
   1357     Uint32 *dstp = (Uint32 *)info->d_pixels;
   1358     int dstskip = info->d_skip >> 2;
   1359     vector unsigned char mergePermute;
   1360     vector unsigned char valpha;
   1361     vector unsigned char valphamask;
   1362     vector unsigned short v1;
   1363     vector unsigned short v8;
   1364 
   1365     mergePermute = VEC_MERGE_PERMUTE();
   1366     v1 = vec_splat_u16(1);
   1367     v8 = vec_splat_u16(8);
   1368 
   1369     /* set the alpha to 255 on the destination surf */
   1370     valphamask = VEC_ALPHA_MASK();
   1371 
   1372     /* splat the surface alpha into every byte of valpha */
   1373     ((unsigned char *)&valpha)[0] = alpha;
   1374     valpha = vec_splat(valpha, 0);
   1375 
   1376     while(height--) {
   1377         int width = info->d_width;
            /* Scalar fallback: red/blue are blended together in one multiply
               (0xff00ff) and green (0xff00) in another. */
   1378 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
   1379             Uint32 s = *srcp; \
   1380             Uint32 d = *dstp; \
   1381             Uint32 s1 = s & 0xff00ff; \
   1382             Uint32 d1 = d & 0xff00ff; \
   1383             d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
   1384                  & 0xff00ff; \
   1385             s &= 0xff00; \
   1386             d &= 0xff00; \
   1387             d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
   1388             *dstp = d1 | d | 0xff000000; \
   1389             ++srcp; \
   1390             ++dstp; \
   1391             widthvar--; \
   1392         }
   1393         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   1394         if (width > 0) {
   1395             int extrawidth = (width % 4);
   1396             vector unsigned char valigner = VEC_ALIGNER(srcp);
   1397             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
   1398             width -= extrawidth;
   1399             while (width) {
   1400                 vector unsigned char voverflow;
   1401                 vector unsigned char vd;
   1402 
   1403                 /* s = *srcp */
   1404                 voverflow = (vector unsigned char)vec_ld(15, srcp);
   1405                 vs = vec_perm(vs, voverflow, valigner);
   1406 
   1407                 /* d = *dstp */
   1408                 vd = (vector unsigned char)vec_ld(0, dstp);
   1409 
   1410                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   1411 
   1412                 /* set the alpha channel to full on */
   1413                 vd = vec_or(vd, valphamask);
   1414 
   1415                 /* *dstp = res */
   1416                 vec_st((vector unsigned int)vd, 0, dstp);
   1417 
   1418                 srcp += 4;
   1419                 dstp += 4;
   1420                 width -= 4;
                        /* the second load becomes the first half of the next unaligned read */
   1421                 vs = voverflow;
   1422             }
   1423             ONE_PIXEL_BLEND((extrawidth), extrawidth);
   1424         }
   1425 #undef ONE_PIXEL_BLEND
   1426 
   1427         srcp += srcskip;
   1428         dstp += dstskip;
   1429     }
   1430 }
   1431 #if __MWERKS__
   1432 #pragma altivec_model off
   1433 #endif
   1434 #endif /* SDL_ALTIVEC_BLITTERS */
   1435 
   1436 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
        /*
         * Writes, for every pixel, the per-channel average of source and
         * destination with the top byte forced to 0xff.  Called by
         * BlitRGBtoRGBSurfaceAlpha when the surface alpha is exactly 128.
         * The per-pixel statement is passed to DUFFS_LOOP4 (defined elsewhere),
         * which repeats it across the row.
         */
   1437 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
   1438 {
   1439 	int width = info->d_width;
   1440 	int height = info->d_height;
   1441 	Uint32 *srcp = (Uint32 *)info->s_pixels;
        	/* skips are shifted right by 2 because the pointers advance in Uint32 units */
   1442 	int srcskip = info->s_skip >> 2;
   1443 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   1444 	int dstskip = info->d_skip >> 2;
   1445 
   1446 	while(height--) {
        	    /* Clearing the low bit of every channel (0x00fefefe) before adding
        	       keeps the >> 1 from carrying between channels; the
        	       (s & d & 0x00010101) term adds back 1 where both low bits were set. */
   1447 	    DUFFS_LOOP4({
   1448 		    Uint32 s = *srcp++;
   1449 		    Uint32 d = *dstp;
   1450 		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   1451 			       + (s & d & 0x00010101)) | 0xff000000;
   1452 	    }, width);
   1453 	    srcp += srcskip;
   1454 	    dstp += dstskip;
   1455 	}
   1456 }
   1457 
   1458 /* fast RGB888->(A)RGB888 blending with surface alpha */
        /*
         * Blends every source pixel onto the destination with the per-surface
         * alpha (info->src->alpha) and forces the top byte of each written
         * pixel to 0xff.  Alpha == 128 is delegated to
         * BlitRGBtoRGBSurfaceAlpha128.
         *
         * The row loop is DUFFS_LOOP_DOUBLE2 (defined elsewhere), which is
         * given a one-pixel body and a two-pixel body; presumably it uses the
         * first for a leftover pixel and the second for pairs -- confirm
         * against the macro definition.
         */
   1459 static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
   1460 {
   1461 	unsigned alpha = info->src->alpha;
   1462 	if(alpha == 128) {
   1463 		BlitRGBtoRGBSurfaceAlpha128(info);
   1464 	} else {
   1465 		int width = info->d_width;
   1466 		int height = info->d_height;
   1467 		Uint32 *srcp = (Uint32 *)info->s_pixels;
        		/* skips are shifted right by 2 because the pointers advance in Uint32 units */
   1468 		int srcskip = info->s_skip >> 2;
   1469 		Uint32 *dstp = (Uint32 *)info->d_pixels;
   1470 		int dstskip = info->d_skip >> 2;
   1471 		Uint32 s;
   1472 		Uint32 d;
   1473 		Uint32 s1;
   1474 		Uint32 d1;
   1475 
   1476 		while(height--) {
   1477 			DUFFS_LOOP_DOUBLE2({
   1478 				/* One Pixel Blend */
        				/* red/blue share one multiply (0xff00ff); green (0xff00) gets its own */
   1479 				s = *srcp;
   1480 				d = *dstp;
   1481 				s1 = s & 0xff00ff;
   1482 				d1 = d & 0xff00ff;
   1483 				d1 = (d1 + ((s1 - d1) * alpha >> 8))
   1484 				     & 0xff00ff;
   1485 				s &= 0xff00;
   1486 				d &= 0xff00;
   1487 				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
   1488 				*dstp = d1 | d | 0xff000000;
   1489 				++srcp;
   1490 				++dstp;
   1491 			},{
   1492 			        /* Two Pixels Blend */
        				/* first pixel: red/blue */
   1493 				s = *srcp;
   1494 				d = *dstp;
   1495 				s1 = s & 0xff00ff;
   1496 				d1 = d & 0xff00ff;
   1497 				d1 += (s1 - d1) * alpha >> 8;
   1498 				d1 &= 0xff00ff;
   1499 
        				/* pack both greens into one word (first pixel in bits 0-7,
        				   second pixel in bits 16-23) and blend them with one multiply */
   1500 				s = ((s & 0xff00) >> 8) |
   1501 					((srcp[1] & 0xff00) << 8);
   1502 				d = ((d & 0xff00) >> 8) |
   1503 					((dstp[1] & 0xff00) << 8);
   1504 				d += (s - d) * alpha >> 8;
   1505 				d &= 0x00ff00ff;
   1506 
        				/* write the first pixel: its green is the low half of d */
   1507 				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
   1508 				++srcp;
   1509 
        				/* second pixel: red/blue */
   1510 			        s1 = *srcp;
   1511 				d1 = *dstp;
   1512 				s1 &= 0xff00ff;
   1513 				d1 &= 0xff00ff;
   1514 				d1 += (s1 - d1) * alpha >> 8;
   1515 				d1 &= 0xff00ff;
   1516 
        				/* write the second pixel: its green is the high half of d */
   1517 				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
   1518 				++srcp;
   1519 				++dstp;
   1520 			}, width);
   1521 			srcp += srcskip;
   1522 			dstp += dstskip;
   1523 		}
   1524 	}
   1525 }
   1526 
   1527 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
        /*
         * Blends every source pixel onto the destination using the source
         * pixel's own alpha (bits 24-31).  The destination's top byte is
         * preserved.  The per-pixel statement is passed to DUFFS_LOOP4
         * (defined elsewhere), which repeats it across the row.
         */
   1528 static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
   1529 {
   1530 	int width = info->d_width;
   1531 	int height = info->d_height;
   1532 	Uint32 *srcp = (Uint32 *)info->s_pixels;
        	/* skips are shifted right by 2 because the pointers advance in Uint32 units */
   1533 	int srcskip = info->s_skip >> 2;
   1534 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   1535 	int dstskip = info->d_skip >> 2;
   1536 
   1537 	while(height--) {
   1538 	    DUFFS_LOOP4({
   1539 		Uint32 dalpha;
   1540 		Uint32 d;
   1541 		Uint32 s1;
   1542 		Uint32 d1;
   1543 		Uint32 s = *srcp;
   1544 		Uint32 alpha = s >> 24;
   1545 		/* FIXME: Here we special-case opaque alpha since the
   1546 		   compositioning used (>>8 instead of /255) doesn't handle
   1547 		   it correctly. Also special-case alpha=0 for speed?
   1548 		   Benchmark this! */
   1549 		if(alpha) {
   1550 		  if(alpha == SDL_ALPHA_OPAQUE) {
        		    /* opaque: take the source RGB and keep the destination's top byte */
   1551 		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
   1552 		  } else {
   1553 		    /*
   1554 		     * take out the middle component (green), and process
   1555 		     * the other two in parallel. One multiply less.
   1556 		     */
   1557 		    d = *dstp;
   1558 		    dalpha = d & 0xff000000;
   1559 		    s1 = s & 0xff00ff;
   1560 		    d1 = d & 0xff00ff;
   1561 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
   1562 		    s &= 0xff00;
   1563 		    d &= 0xff00;
   1564 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
   1565 		    *dstp = d1 | d | dalpha;
   1566 		  }
   1567 		}
   1568 		++srcp;
   1569 		++dstp;
   1570 	    }, width);
   1571 	    srcp += srcskip;
   1572 	    dstp += dstskip;
   1573 	}
   1574 }
   1575 
   1576 #if GCC_ASMBLIT
/*
 * fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha
 *
 * GCC inline-assembly variant.  The constant masks and the alpha shift are
 * loaded into MMX registers once, before the pixel loops, and the per-pixel
 * asm statements read them again:
 *   mm3 = ~channel mask      mm4 = channel mask      mm5 = Ashift
 *   mm6 = zero               mm7 = multiplier mask (alpha lane cleared)
 *
 * Per pixel: source alpha 0 leaves the destination alone, source alpha equal
 * to Amask copies the colour channels and keeps the destination alpha, and
 * anything else is blended as dst + (((src - dst) * alpha) >> 8) with the
 * alpha lane multiplied by zero so the destination alpha survives.
 *
 * NOTE(review): none of the asm statements declares outputs or clobbers, so
 * this relies on the compiler leaving the MMX registers untouched between
 * statements and is never told about the stores made through dstp.  Confirm
 * that is acceptable before enabling GCC_ASMBLIT.
 */
static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;	/* byte skip -> 32-bit pixels */
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;	/* byte skip -> 32-bit pixels */
	SDL_PixelFormat* sf = info->src;
	Uint32 amask = sf->Amask;

	/* one-time setup of the constant MMX registers listed above */
	__asm__ (
	/* make mm6 all zeros. */
	"pxor       %%mm6, %%mm6\n"

	/* Make a mask to preserve the alpha. */
	"movd      %0, %%mm7\n\t"           /* 0000F000 -> mm7 */
	"punpcklbw %%mm7, %%mm7\n\t"        /* FF000000 -> mm7 */
	"pcmpeqb   %%mm4, %%mm4\n\t"        /* FFFFFFFF -> mm4 */
	"movq      %%mm4, %%mm3\n\t"        /* FFFFFFFF -> mm3 (for later) */
	"pxor      %%mm4, %%mm7\n\t"        /* 00FFFFFF -> mm7 (mult mask) */

	/* form channel masks */
	"movq      %%mm7, %%mm4\n\t"        /* 00FFFFFF -> mm4 */
	"packsswb  %%mm6, %%mm4\n\t"        /* 00000FFF -> mm4 (channel mask) */
	"packsswb  %%mm6, %%mm3\n\t"        /* 0000FFFF -> mm3 */
	"pxor      %%mm4, %%mm3\n\t"        /* 0000F000 -> mm3 (~channel mask) */

	/* get alpha channel shift */
	"movd      %1, %%mm5\n\t" /* Ashift -> mm5 */

	  : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );

	while(height--) {

	    DUFFS_LOOP4({
		Uint32 alpha;

		/* request the data 64 bytes ahead of both pointers */
		__asm__ (
		"prefetch 64(%0)\n"
		"prefetch 64(%1)\n"
			: : "r" (srcp), "r" (dstp) );

		/* alpha is kept in place (not shifted down) for the compares below */
		alpha = *srcp & amask;
		/* FIXME: Here we special-case opaque alpha since the
		   compositing used (>>8 instead of /255) doesn't handle
		   it correctly. Also special-case alpha=0 for speed?
		   Benchmark this! */
		if(alpha == 0) {
		    /* do nothing */
		}
		else if(alpha == amask) {
			/* opaque alpha -- copy RGB, keep dst alpha */
		    /* using MMX here to free up regular registers for other things */
			    __asm__ (
		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm1 */
		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */

		     : : "r" (srcp), "r" (dstp) );
		}

		else {
			    /* partial alpha: dst += ((src - dst) * alpha) >> 8 per colour channel */
			    __asm__ (
		    /* load in the source, and dst. */
		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */

		    /* Move the src alpha into mm2 */

		    /* if supporting pshufw */
		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
		    /*"psrlw     $8, %%mm2\n" */

		    /* else: */
		    "movd       %2,    %%mm2\n"
		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
		    "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */

		    /* move the colors into words. */
		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
		    "punpcklbw %%mm6, %%mm1\n"              /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */

		    /* src - dst */
		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */

		    /* A * (src-dst) */
		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */

		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */

		    "movd      %%mm0, (%1)\n"               /* result in mm0 */

		     : : "r" (srcp), "r" (dstp), "r" (alpha) );

		}
		++srcp;
		++dstp;
	    }, width);
	    srcp += srcskip;
	    dstp += dstskip;
	}

	/* leave MMX state so that x87 floating point can be used again */
	__asm__ (
	"emms\n"
		:   );
}
   1692 /* End GCC_ASMBLIT*/
   1693 
   1694 #elif MSVC_ASMBLIT
/*
 * fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha
 *
 * MSVC intrinsics variant.  Per pixel: source alpha 0 leaves the destination
 * alone, source alpha equal to Amask copies the colour channels and keeps
 * everything else of the destination, and any other alpha blends each colour
 * channel as dst + (((src - dst) * alpha) >> 8).  The alpha lane of the
 * multiplier is zeroed through dmask so the destination alpha passes through
 * unchanged.
 */
static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;	/* byte skip -> 32-bit pixels */
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;	/* byte skip -> 32-bit pixels */
	SDL_PixelFormat* sf = info->src;
	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
	Uint32 amask = sf->Amask;
	Uint32 ashift = sf->Ashift;
	Uint64 multmask;

	__m64 src1, dst1, mm_alpha, mm_zero, dmask;

	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
	/* Once the pixel bytes are widened to 16-bit words, the byte that sat at
	   bit `ashift` sits at bit `ashift * 2`; clear exactly that word. */
	multmask = ~(0xFFFFi64 << (ashift * 2));
	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */

	while(height--) {
	    DUFFS_LOOP4({
		Uint32 alpha;

		/* request the data 16 pixels (64 bytes) ahead of both pointers */
		_m_prefetch(srcp + 16);
		_m_prefetch(dstp + 16);

		/* alpha is kept in place (not shifted down) for the compares below */
		alpha = *srcp & amask;
		if (alpha == 0) {
			/* do nothing */
		} else if (alpha == amask) {
			/* copy RGB, keep dst alpha */
			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
		} else {
			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */

			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */

			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */

			/* blend */
			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
			/* byte-wise add: only the low byte of each word carries the result */
			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */

			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
		}
		++srcp;
		++dstp;
	    }, width);
	    srcp += srcskip;
	    dstp += dstskip;
	}
	/* leave MMX state so that x87 floating point can be used again */
	_mm_empty();
}
   1759 /* End MSVC_ASMBLIT */
   1760 
   1761 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   1762 
   1763 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
   1764 
   1765 /* blend a single 16 bit pixel at 50% */
   1766 #define BLEND16_50(d, s, mask)						\
   1767 	((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
   1768 
   1769 /* blend two 16 bit pixels at 50% */
   1770 #define BLEND2x16_50(d, s, mask)					     \
   1771 	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
   1772 	 + (s & d & (~(mask | mask << 16))))
   1773 
   1774 static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
   1775 {
   1776 	int width = info->d_width;
   1777 	int height = info->d_height;
   1778 	Uint16 *srcp = (Uint16 *)info->s_pixels;
   1779 	int srcskip = info->s_skip >> 1;
   1780 	Uint16 *dstp = (Uint16 *)info->d_pixels;
   1781 	int dstskip = info->d_skip >> 1;
   1782 
   1783 	while(height--) {
   1784 		if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
   1785 			/*
   1786 			 * Source and destination not aligned, pipeline it.
   1787 			 * This is mostly a win for big blits but no loss for
   1788 			 * small ones
   1789 			 */
   1790 			Uint32 prev_sw;
   1791 			int w = width;
   1792 
   1793 			/* handle odd destination */
   1794 			if((uintptr_t)dstp & 2) {
   1795 				Uint16 d = *dstp, s = *srcp;
   1796 				*dstp = BLEND16_50(d, s, mask);
   1797 				dstp++;
   1798 				srcp++;
   1799 				w--;
   1800 			}
   1801 			srcp++;	/* srcp is now 32-bit aligned */
   1802 
   1803 			/* bootstrap pipeline with first halfword */
   1804 			prev_sw = ((Uint32 *)srcp)[-1];
   1805 
   1806 			while(w > 1) {
   1807 				Uint32 sw, dw, s;
   1808 				sw = *(Uint32 *)srcp;
   1809 				dw = *(Uint32 *)dstp;
   1810 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
   1811 				s = (prev_sw << 16) + (sw >> 16);
   1812 #else
   1813 				s = (prev_sw >> 16) + (sw << 16);
   1814 #endif
   1815 				prev_sw = sw;
   1816 				*(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
   1817 				dstp += 2;
   1818 				srcp += 2;
   1819 				w -= 2;
   1820 			}
   1821 
   1822 			/* final pixel if any */
   1823 			if(w) {
   1824 				Uint16 d = *dstp, s;
   1825 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
   1826 				s = (Uint16)prev_sw;
   1827 #else
   1828 				s = (Uint16)(prev_sw >> 16);
   1829 #endif
   1830 				*dstp = BLEND16_50(d, s, mask);
   1831 				srcp++;
   1832 				dstp++;
   1833 			}
   1834 			srcp += srcskip - 1;
   1835 			dstp += dstskip;
   1836 		} else {
   1837 			/* source and destination are aligned */
   1838 			int w = width;
   1839 
   1840 			/* first odd pixel? */
   1841 			if((uintptr_t)srcp & 2) {
   1842 				Uint16 d = *dstp, s = *srcp;
   1843 				*dstp = BLEND16_50(d, s, mask);
   1844 				srcp++;
   1845 				dstp++;
   1846 				w--;
   1847 			}
   1848 			/* srcp and dstp are now 32-bit aligned */
   1849 
   1850 			while(w > 1) {
   1851 				Uint32 sw = *(Uint32 *)srcp;
   1852 				Uint32 dw = *(Uint32 *)dstp;
   1853 				*(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
   1854 				srcp += 2;
   1855 				dstp += 2;
   1856 				w -= 2;
   1857 			}
   1858 
   1859 			/* last odd pixel? */
   1860 			if(w) {
   1861 				Uint16 d = *dstp, s = *srcp;
   1862 				*dstp = BLEND16_50(d, s, mask);
   1863 				srcp++;
   1864 				dstp++;
   1865 			}
   1866 			srcp += srcskip;
   1867 			dstp += dstskip;
   1868 		}
   1869 	}
   1870 }
   1871 
   1872 #if GCC_ASMBLIT
/*
 * fast RGB565->RGB565 blending with surface alpha
 *
 * GCC variant built on the mmx.h-style register macros.  An alpha of exactly
 * 128 is handed to Blit16to16SurfaceAlpha128.  Otherwise the alpha is cut to
 * its top 5 bits and each channel is blended as
 * dst + (((src - dst) * alpha5) >> 5).  DUFFS_LOOP_QUATRO2 is defined
 * elsewhere; the three bodies passed to it below blend one, two and four
 * pixels respectively, the first two in plain C and the last one with MMX.
 *
 * MMX registers held across the loops:
 *   mm0 = alpha, replicated into all four words and shifted left by 3
 *   mm4 = green mask (0x07E0 per word)
 *   mm7 = blue mask  (0x001F per word)
 */
static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha; /* per-surface alpha; downscaled to 5 bits below */
	if(alpha == 128) {
		Blit16to16SurfaceAlpha128(info, 0xf7de);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;	/* byte skip -> 16-bit pixels */
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;	/* byte skip -> 16-bit pixels */
		Uint32 s, d;
		Uint64 load;	/* staging area for 64-bit constants loaded into MMX registers */

		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
		load = alpha;
		alpha >>= 3;		/* downscale alpha to 5 bits */

		movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		psllq_i2r(3, mm0);	/* each word is now (alpha >> 3) << 6 */

		/* Setup the 565 color channel masks */
		load = 0x07E007E007E007E0ULL;
		movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
		load = 0x001F001F001F001FULL;
		movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
		while(height--) {
			DUFFS_LOOP_QUATRO2(
			{
				/* one pixel, plain C */
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
			},{
				/* two pixels, plain C: the one-pixel blend done twice */
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
			},{
				/* four pixels at once; mm1 accumulates the result */
				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */

				/* red -- does not need a mask since the right shift clears
				   the uninteresting bits */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* alpha used is actually 11 bits
				   11 + 5 = 16 bits, so the sign bits are lost */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */

				movq_r2r(mm6, mm1); /* save new reds in dsts */

				/* green -- process the bits in place */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
				   bits are gone and the sign bits present */
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */

				por_r2r(mm6, mm1); /* save new greens in dsts */

				/* blue */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 5 = 16 bits, so the sign bits are lost and
				   the interesting bits will need to be MASKed */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */

				por_r2r(mm6, mm1); /* save new blues in dsts */

				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
		/* leave MMX state so that x87 floating point can be used again */
		emms();
	}
}
   2011 
/*
 * fast RGB555->RGB555 blending with surface alpha
 *
 * GCC variant built on the mmx.h-style register macros.  An alpha of exactly
 * 128 is handed to Blit16to16SurfaceAlpha128.  Otherwise the alpha is cut to
 * its top 5 bits and each channel is blended as
 * dst + (((src - dst) * alpha5) >> 5).  DUFFS_LOOP_QUATRO2 is defined
 * elsewhere; the three bodies passed to it below blend one, two and four
 * pixels respectively, the first two in plain C and the last one with MMX.
 *
 * MMX registers held across the loops:
 *   mm0 = alpha, replicated into all four words and shifted left by 3
 *   mm4 = green mask (0x03E0 per word); shifted up to act as the red mask
 *         while red is processed, then shifted back
 *   mm7 = blue mask  (0x001F per word)
 */
static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha; /* per-surface alpha; downscaled to 5 bits below */
	if(alpha == 128) {
		Blit16to16SurfaceAlpha128(info, 0xfbde);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;	/* byte skip -> 16-bit pixels */
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;	/* byte skip -> 16-bit pixels */
		Uint32 s, d;
		Uint64 load;	/* staging area for 64-bit constants loaded into MMX registers */

		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
		load = alpha;
		alpha >>= 3;		/* downscale alpha to 5 bits */

		movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		psllq_i2r(3, mm0);	/* each word is now (alpha >> 3) << 6 */

		/* Setup the 555 color channel masks */
		load = 0x03E003E003E003E0ULL;
		movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
		load = 0x001F001F001F001FULL;
		movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
		while(height--) {
			DUFFS_LOOP_QUATRO2(
			{
				/* one pixel, plain C */
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = d | d >> 16;
			},{
				/* two pixels, plain C: the one-pixel blend done twice */
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = d | d >> 16;
			        s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = d | d >> 16;
			},{
				/* four pixels at once; mm1 accumulates the result */
				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */

				/* red -- process the bits in place */
				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
					/* by reusing the GREEN mask we free up another mmx
					   register to accumulate the result */

				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
				   cleared by a MASK below */
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */

				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */

				movq_r2r(mm6, mm1); /* save new reds in dsts */

				/* green -- process the bits in place */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
				   bits are gone and the sign bits present */
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */

				por_r2r(mm6, mm1); /* save new greens in dsts */

				/* blue */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 5 = 16 bits, so the sign bits are lost and
				   the interesting bits will need to be MASKed */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */

				por_r2r(mm6, mm1); /* save new blues in dsts */

				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
		/* leave MMX state so that x87 floating point can be used again */
		emms();
	}
}
   2155 /* End GCC_ASMBLIT */
   2156 
   2157 #elif MSVC_ASMBLIT
/*
 * fast RGB565->RGB565 blending with surface alpha
 *
 * MSVC intrinsics variant.  An alpha of exactly 128 is handed to
 * Blit16to16SurfaceAlpha128.  Otherwise the alpha is cut to its top 5 bits
 * and each channel is blended as dst + (((src - dst) * alpha5) >> 5).
 * DUFFS_LOOP_QUATRO2 is defined elsewhere; the three bodies passed to it
 * below blend one, two and four pixels respectively, the first two in plain
 * C and the last one with MMX intrinsics.
 */
static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha;	/* per-surface alpha; downscaled to 5 bits below */
	if(alpha == 128) {
		Blit16to16SurfaceAlpha128(info, 0xf7de);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;	/* byte skip -> 16-bit pixels */
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;	/* byte skip -> 16-bit pixels */
		Uint32 s, d;

		__m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;

		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
		alpha >>= 3;		/* downscale alpha to 5 bits */

		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		mm_alpha = _mm_slli_si64(mm_alpha, 3);	/* each word is now (alpha >> 3) << 6 */

		/* Setup the 565 color channel masks */
		gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */

		while(height--) {
			DUFFS_LOOP_QUATRO2(
			{
				/* one pixel, plain C */
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = (Uint16)(d | d >> 16);
			},{
				/* two pixels, plain C: the one-pixel blend done twice */
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = (Uint16)(d | d >> 16);
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = (Uint16)(d | d >> 16);
			},{
				/* four pixels at once; mm_res accumulates the result */
				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */

				/* red */
				src2 = src1;
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */

				dst2 = dst1;
				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
				/* the shift back into place also drops any bits above the 5-bit red */
				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */

				mm_res = dst2; /* RED -> mm_res */

				/* green -- process the bits in place */
				src2 = src1;
				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */

				dst2 = dst1;
				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */

				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */

				/* blue */
				src2 = src1;
				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */

				dst2 = dst1;
				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */

				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */

				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
		/* leave MMX state so that x87 floating point can be used again */
		_mm_empty();
	}
}
   2291 
   2292 /* fast RGB555->RGB555 blending with surface alpha */
   2293 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
   2294 {
   2295 	unsigned alpha = info->src->alpha;
   2296 	if(alpha == 128) {
   2297 		Blit16to16SurfaceAlpha128(info, 0xfbde);
   2298 	} else {
   2299 		int width = info->d_width;
   2300 		int height = info->d_height;
   2301 		Uint16 *srcp = (Uint16 *)info->s_pixels;
   2302 		int srcskip = info->s_skip >> 1;
   2303 		Uint16 *dstp = (Uint16 *)info->d_pixels;
   2304 		int dstskip = info->d_skip >> 1;
   2305 		Uint32 s, d;
   2306 
   2307 		__m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
   2308 
   2309 		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
   2310 		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
   2311 		alpha >>= 3;		/* downscale alpha to 5 bits */
   2312 
   2313 		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   2314 		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
   2315 		/* position alpha to allow for mullo and mulhi on diff channels
   2316 		   to reduce the number of operations */
   2317 		mm_alpha = _mm_slli_si64(mm_alpha, 3);
   2318 
   2319 		/* Setup the 555 color channel masks */
   2320 		rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
   2321 		gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
   2322 		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
   2323 
   2324 		while(height--) {
   2325 			DUFFS_LOOP_QUATRO2(
   2326 			{
   2327 				s = *srcp++;
   2328 				d = *dstp;
   2329 				/*
   2330 				 * shift out the middle component (green) to
   2331 				 * the high 16 bits, and process all three RGB
   2332 				 * components at the same time.
   2333 				 */
   2334 				s = (s | s << 16) & 0x03e07c1f;
   2335 				d = (d | d << 16) & 0x03e07c1f;
   2336 				d += (s - d) * alpha >> 5;
   2337 				d &= 0x03e07c1f;
   2338 				*dstp++ = (Uint16)(d | d >> 16);
   2339 			},{
   2340 				s = *srcp++;
   2341 				d = *dstp;
   2342 				/*
   2343 				 * shift out the middle component (green) to
   2344 				 * the high 16 bits, and process all three RGB
   2345 				 * components at the same time.
   2346 				 */
   2347 				s = (s | s << 16) & 0x03e07c1f;
   2348 				d = (d | d << 16) & 0x03e07c1f;
   2349 				d += (s - d) * alpha >> 5;
   2350 				d &= 0x03e07c1f;
   2351 				*dstp++ = (Uint16)(d | d >> 16);
   2352 			        s = *srcp++;
   2353 				d = *dstp;
   2354 				/*
   2355 				 * shift out the middle component (green) to
   2356 				 * the high 16 bits, and process all three RGB
   2357 				 * components at the same time.
   2358 				 */
   2359 				s = (s | s << 16) & 0x03e07c1f;
   2360 				d = (d | d << 16) & 0x03e07c1f;
   2361 				d += (s - d) * alpha >> 5;
   2362 				d &= 0x03e07c1f;
   2363 				*dstp++ = (Uint16)(d | d >> 16);
   2364 			},{
   2365 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
   2366 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
   2367 
   2368 				/* red -- process the bits in place */
   2369 				src2 = src1;
   2370 				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
   2371 
   2372 				dst2 = dst1;
   2373 				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
   2374 
   2375 				/* blend */
   2376 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   2377 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   2378 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
   2379 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   2380 				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
   2381 
   2382 				mm_res = dst2; /* RED -> mm_res */
   2383 
   2384 				/* green -- process the bits in place */
   2385 				src2 = src1;
   2386 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
   2387 
   2388 				dst2 = dst1;
   2389 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
   2390 
   2391 				/* blend */
   2392 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   2393 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   2394 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
   2395 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   2396 
   2397 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
   2398 
   2399 				/* blue */
   2400 				src2 = src1; /* src -> src2 */
   2401 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
   2402 
   2403 				dst2 = dst1; /* dst -> dst2 */
   2404 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
   2405 
   2406 				/* blend */
   2407 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   2408 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   2409 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
   2410 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   2411 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
   2412 
   2413 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
   2414 
   2415 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
   2416 
   2417 				srcp += 4;
   2418 				dstp += 4;
   2419 			}, width);
   2420 			srcp += srcskip;
   2421 			dstp += dstskip;
   2422 		}
   2423 		_mm_empty();
   2424 	}
   2425 }
   2426 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   2427 
   2428 /* fast RGB565->RGB565 blending with surface alpha */
   2429 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
   2430 {
   2431 	unsigned alpha = info->src->alpha;
   2432 	if(alpha == 128) {
   2433 		Blit16to16SurfaceAlpha128(info, 0xf7de);
   2434 	} else {
   2435 		int width = info->d_width;
   2436 		int height = info->d_height;
   2437 		Uint16 *srcp = (Uint16 *)info->s_pixels;
   2438 		int srcskip = info->s_skip >> 1;
   2439 		Uint16 *dstp = (Uint16 *)info->d_pixels;
   2440 		int dstskip = info->d_skip >> 1;
   2441 		alpha >>= 3;	/* downscale alpha to 5 bits */
   2442 
   2443 		while(height--) {
   2444 			DUFFS_LOOP4({
   2445 				Uint32 s = *srcp++;
   2446 				Uint32 d = *dstp;
   2447 				/*
   2448 				 * shift out the middle component (green) to
   2449 				 * the high 16 bits, and process all three RGB
   2450 				 * components at the same time.
   2451 				 */
   2452 				s = (s | s << 16) & 0x07e0f81f;
   2453 				d = (d | d << 16) & 0x07e0f81f;
   2454 				d += (s - d) * alpha >> 5;
   2455 				d &= 0x07e0f81f;
   2456 				*dstp++ = (Uint16)(d | d >> 16);
   2457 			}, width);
   2458 			srcp += srcskip;
   2459 			dstp += dstskip;
   2460 		}
   2461 	}
   2462 }
   2463 
   2464 /* fast RGB555->RGB555 blending with surface alpha */
   2465 static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
   2466 {
   2467 	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
   2468 	if(alpha == 128) {
   2469 		Blit16to16SurfaceAlpha128(info, 0xfbde);
   2470 	} else {
   2471 		int width = info->d_width;
   2472 		int height = info->d_height;
   2473 		Uint16 *srcp = (Uint16 *)info->s_pixels;
   2474 		int srcskip = info->s_skip >> 1;
   2475 		Uint16 *dstp = (Uint16 *)info->d_pixels;
   2476 		int dstskip = info->d_skip >> 1;
   2477 		alpha >>= 3;		/* downscale alpha to 5 bits */
   2478 
   2479 		while(height--) {
   2480 			DUFFS_LOOP4({
   2481 				Uint32 s = *srcp++;
   2482 				Uint32 d = *dstp;
   2483 				/*
   2484 				 * shift out the middle component (green) to
   2485 				 * the high 16 bits, and process all three RGB
   2486 				 * components at the same time.
   2487 				 */
   2488 				s = (s | s << 16) & 0x03e07c1f;
   2489 				d = (d | d << 16) & 0x03e07c1f;
   2490 				d += (s - d) * alpha >> 5;
   2491 				d &= 0x03e07c1f;
   2492 				*dstp++ = (Uint16)(d | d >> 16);
   2493 			}, width);
   2494 			srcp += srcskip;
   2495 			dstp += dstskip;
   2496 		}
   2497 	}
   2498 }
   2499 
   2500 /* fast ARGB8888->RGB565 blending with pixel alpha */
   2501 static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
   2502 {
   2503 	int width = info->d_width;
   2504 	int height = info->d_height;
   2505 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   2506 	int srcskip = info->s_skip >> 2;
   2507 	Uint16 *dstp = (Uint16 *)info->d_pixels;
   2508 	int dstskip = info->d_skip >> 1;
   2509 
   2510 	while(height--) {
   2511 	    DUFFS_LOOP4({
   2512 		Uint32 s = *srcp;
   2513 		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
   2514 		/* FIXME: Here we special-case opaque alpha since the
   2515 		   compositioning used (>>8 instead of /255) doesn't handle
   2516 		   it correctly. Also special-case alpha=0 for speed?
   2517 		   Benchmark this! */
   2518 		if(alpha) {
   2519 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
   2520 		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
   2521 		  } else {
   2522 		    Uint32 d = *dstp;
   2523 		    /*
   2524 		     * convert source and destination to G0RAB65565
   2525 		     * and blend all components at the same time
   2526 		     */
   2527 		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
   2528 		      + (s >> 3 & 0x1f);
   2529 		    d = (d | d << 16) & 0x07e0f81f;
   2530 		    d += (s - d) * alpha >> 5;
   2531 		    d &= 0x07e0f81f;
   2532 		    *dstp = (Uint16)(d | d >> 16);
   2533 		  }
   2534 		}
   2535 		srcp++;
   2536 		dstp++;
   2537 	    }, width);
   2538 	    srcp += srcskip;
   2539 	    dstp += dstskip;
   2540 	}
   2541 }
   2542 
   2543 /* fast ARGB8888->RGB555 blending with pixel alpha */
   2544 static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
   2545 {
   2546 	int width = info->d_width;
   2547 	int height = info->d_height;
   2548 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   2549 	int srcskip = info->s_skip >> 2;
   2550 	Uint16 *dstp = (Uint16 *)info->d_pixels;
   2551 	int dstskip = info->d_skip >> 1;
   2552 
   2553 	while(height--) {
   2554 	    DUFFS_LOOP4({
   2555 		unsigned alpha;
   2556 		Uint32 s = *srcp;
   2557 		alpha = s >> 27; /* downscale alpha to 5 bits */
   2558 		/* FIXME: Here we special-case opaque alpha since the
   2559 		   compositioning used (>>8 instead of /255) doesn't handle
   2560 		   it correctly. Also special-case alpha=0 for speed?
   2561 		   Benchmark this! */
   2562 		if(alpha) {
   2563 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
   2564 		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
   2565 		  } else {
   2566 		    Uint32 d = *dstp;
   2567 		    /*
   2568 		     * convert source and destination to G0RAB65565
   2569 		     * and blend all components at the same time
   2570 		     */
   2571 		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
   2572 		      + (s >> 3 & 0x1f);
   2573 		    d = (d | d << 16) & 0x03e07c1f;
   2574 		    d += (s - d) * alpha >> 5;
   2575 		    d &= 0x03e07c1f;
   2576 		    *dstp = (Uint16)(d | d >> 16);
   2577 		  }
   2578 		}
   2579 		srcp++;
   2580 		dstp++;
   2581 	    }, width);
   2582 	    srcp += srcskip;
   2583 	    dstp += dstskip;
   2584 	}
   2585 }
   2586 
   2587 /* General (slow) N->N blending with per-surface alpha */
   2588 static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
   2589 {
   2590 	int width = info->d_width;
   2591 	int height = info->d_height;
   2592 	Uint8 *src = info->s_pixels;
   2593 	int srcskip = info->s_skip;
   2594 	Uint8 *dst = info->d_pixels;
   2595 	int dstskip = info->d_skip;
   2596 	SDL_PixelFormat *srcfmt = info->src;
   2597 	SDL_PixelFormat *dstfmt = info->dst;
   2598 	int srcbpp = srcfmt->BytesPerPixel;
   2599 	int dstbpp = dstfmt->BytesPerPixel;
   2600 	unsigned sA = srcfmt->alpha;
   2601 	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   2602 
   2603 	if(sA) {
   2604 	  while ( height-- ) {
   2605 	    DUFFS_LOOP4(
   2606 	    {
   2607 		Uint32 Pixel;
   2608 		unsigned sR;
   2609 		unsigned sG;
   2610 		unsigned sB;
   2611 		unsigned dR;
   2612 		unsigned dG;
   2613 		unsigned dB;
   2614 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   2615 		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
   2616 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
   2617 		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
   2618 		src += srcbpp;
   2619 		dst += dstbpp;
   2620 	    },
   2621 	    width);
   2622 	    src += srcskip;
   2623 	    dst += dstskip;
   2624 	  }
   2625 	}
   2626 }
   2627 
   2628 /* General (slow) colorkeyed N->N blending with per-surface alpha */
   2629 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
   2630 {
   2631 	int width = info->d_width;
   2632 	int height = info->d_height;
   2633 	Uint8 *src = info->s_pixels;
   2634 	int srcskip = info->s_skip;
   2635 	Uint8 *dst = info->d_pixels;
   2636 	int dstskip = info->d_skip;
   2637 	SDL_PixelFormat *srcfmt = info->src;
   2638 	SDL_PixelFormat *dstfmt = info->dst;
   2639 	Uint32 ckey = srcfmt->colorkey;
   2640 	int srcbpp = srcfmt->BytesPerPixel;
   2641 	int dstbpp = dstfmt->BytesPerPixel;
   2642 	unsigned sA = srcfmt->alpha;
   2643 	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   2644 
   2645 	while ( height-- ) {
   2646 	    DUFFS_LOOP4(
   2647 	    {
   2648 		Uint32 Pixel;
   2649 		unsigned sR;
   2650 		unsigned sG;
   2651 		unsigned sB;
   2652 		unsigned dR;
   2653 		unsigned dG;
   2654 		unsigned dB;
   2655 		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
   2656 		if(sA && Pixel != ckey) {
   2657 		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
   2658 		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
   2659 		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
   2660 		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
   2661 		}
   2662 		src += srcbpp;
   2663 		dst += dstbpp;
   2664 	    },
   2665 	    width);
   2666 	    src += srcskip;
   2667 	    dst += dstskip;
   2668 	}
   2669 }
   2670 
   2671 /* General (slow) N->N blending with pixel alpha */
   2672 static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
   2673 {
   2674 	int width = info->d_width;
   2675 	int height = info->d_height;
   2676 	Uint8 *src = info->s_pixels;
   2677 	int srcskip = info->s_skip;
   2678 	Uint8 *dst = info->d_pixels;
   2679 	int dstskip = info->d_skip;
   2680 	SDL_PixelFormat *srcfmt = info->src;
   2681 	SDL_PixelFormat *dstfmt = info->dst;
   2682 
   2683 	int  srcbpp;
   2684 	int  dstbpp;
   2685 
   2686 	/* Set up some basic variables */
   2687 	srcbpp = srcfmt->BytesPerPixel;
   2688 	dstbpp = dstfmt->BytesPerPixel;
   2689 
   2690 	/* FIXME: for 8bpp source alpha, this doesn't get opaque values
   2691 	   quite right. for <8bpp source alpha, it gets them very wrong
   2692 	   (check all macros!)
   2693 	   It is unclear whether there is a good general solution that doesn't
   2694 	   need a branch (or a divide). */
   2695 	while ( height-- ) {
   2696 	    DUFFS_LOOP4(
   2697 	    {
   2698 		Uint32 Pixel;
   2699 		unsigned sR;
   2700 		unsigned sG;
   2701 		unsigned sB;
   2702 		unsigned dR;
   2703 		unsigned dG;
   2704 		unsigned dB;
   2705 		unsigned sA;
   2706 		unsigned dA;
   2707 		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
   2708 		if(sA) {
   2709 		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
   2710 		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
   2711 		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
   2712 		}
   2713 		src += srcbpp;
   2714 		dst += dstbpp;
   2715 	    },
   2716 	    width);
   2717 	    src += srcskip;
   2718 	    dst += dstskip;
   2719 	}
   2720 }
   2721 
   2722 
   2723 SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
   2724 {
   2725     SDL_PixelFormat *sf = surface->format;
   2726     SDL_PixelFormat *df = surface->map->dst->format;
   2727 
   2728     if(sf->Amask == 0) {
   2729 	if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
   2730 	    if(df->BytesPerPixel == 1)
   2731 		return BlitNto1SurfaceAlphaKey;
   2732 	    else
   2733 #if SDL_ALTIVEC_BLITTERS
   2734 	if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
   2735 	    !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
   2736             return Blit32to32SurfaceAlphaKeyAltivec;
   2737         else
   2738 #endif
   2739             return BlitNtoNSurfaceAlphaKey;
   2740 	} else {
   2741 	    /* Per-surface alpha blits */
   2742 	    switch(df->BytesPerPixel) {
   2743 	    case 1:
   2744 		return BlitNto1SurfaceAlpha;
   2745 
   2746 	    case 2:
   2747 		if(surface->map->identity) {
   2748 		    if(df->Gmask == 0x7e0)
   2749 		    {
   2750 #if MMX_ASMBLIT
   2751 		if(SDL_HasMMX())
   2752 			return Blit565to565SurfaceAlphaMMX;
   2753 		else
   2754 #endif
   2755 			return Blit565to565SurfaceAlpha;
   2756 		    }
   2757 		    else if(df->Gmask == 0x3e0)
   2758 		    {
   2759 #if MMX_ASMBLIT
   2760 		if(SDL_HasMMX())
   2761 			return Blit555to555SurfaceAlphaMMX;
   2762 		else
   2763 #endif
   2764 			return Blit555to555SurfaceAlpha;
   2765 		    }
   2766 		}
   2767 		return BlitNtoNSurfaceAlpha;
   2768 
   2769 	    case 4:
   2770 		if(sf->Rmask == df->Rmask
   2771 		   && sf->Gmask == df->Gmask
   2772 		   && sf->Bmask == df->Bmask
   2773 		   && sf->BytesPerPixel == 4)
   2774 		{
   2775 #if MMX_ASMBLIT
   2776 			if(sf->Rshift % 8 == 0
   2777 			   && sf->Gshift % 8 == 0
   2778 			   && sf->Bshift % 8 == 0
   2779 			   && SDL_HasMMX())
   2780 			    return BlitRGBtoRGBSurfaceAlphaMMX;
   2781 #endif
   2782 			if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
   2783 			{
   2784 #if SDL_ALTIVEC_BLITTERS
   2785 				if(!(surface->map->dst->flags & SDL_HWSURFACE)
   2786 					&& SDL_HasAltiVec())
   2787 					return BlitRGBtoRGBSurfaceAlphaAltivec;
   2788 #endif
   2789 				return BlitRGBtoRGBSurfaceAlpha;
   2790 			}
   2791 		}
   2792 #if SDL_ALTIVEC_BLITTERS
   2793 		if((sf->BytesPerPixel == 4) &&
   2794 		   !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
   2795 			return Blit32to32SurfaceAlphaAltivec;
   2796 		else
   2797 #endif
   2798 			return BlitNtoNSurfaceAlpha;
   2799 
   2800 	    case 3:
   2801 	    default:
   2802 		return BlitNtoNSurfaceAlpha;
   2803 	    }
   2804 	}
   2805     } else {
   2806 	/* Per-pixel alpha blits */
   2807 	switch(df->BytesPerPixel) {
   2808 	case 1:
   2809 	    return BlitNto1PixelAlpha;
   2810 
   2811 	case 2:
   2812 #if SDL_ALTIVEC_BLITTERS
   2813 	if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
   2814            df->Gmask == 0x7e0 &&
   2815 	   df->Bmask == 0x1f && SDL_HasAltiVec())
   2816             return Blit32to565PixelAlphaAltivec;
   2817         else
   2818 #endif
   2819 	    if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
   2820 	       && sf->Gmask == 0xff00
   2821 	       && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
   2822 		   || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
   2823 		if(df->Gmask == 0x7e0)
   2824 		    return BlitARGBto565PixelAlpha;
   2825 		else if(df->Gmask == 0x3e0)
   2826 		    return BlitARGBto555PixelAlpha;
   2827 	    }
   2828 	    return BlitNtoNPixelAlpha;
   2829 
   2830 	case 4:
   2831 	    if(sf->Rmask == df->Rmask
   2832 	       && sf->Gmask == df->Gmask
   2833 	       && sf->Bmask == df->Bmask
   2834 	       && sf->BytesPerPixel == 4)
   2835 	    {
   2836 #if MMX_ASMBLIT
   2837 		if(sf->Rshift % 8 == 0
   2838 		   && sf->Gshift % 8 == 0
   2839 		   && sf->Bshift % 8 == 0
   2840 		   && sf->Ashift % 8 == 0
   2841 		   && sf->Aloss == 0)
   2842 		{
   2843 			if(SDL_Has3DNow())
   2844 				return BlitRGBtoRGBPixelAlphaMMX3DNOW;
   2845 			if(SDL_HasMMX())
   2846 				return BlitRGBtoRGBPixelAlphaMMX;
   2847 		}
   2848 #endif
   2849 		if(sf->Amask == 0xff000000)
   2850 		{
   2851 #if SDL_ALTIVEC_BLITTERS
   2852 			if(!(surface->map->dst->flags & SDL_HWSURFACE)
   2853 				&& SDL_HasAltiVec())
   2854 				return BlitRGBtoRGBPixelAlphaAltivec;
   2855 #endif
   2856 			return BlitRGBtoRGBPixelAlpha;
   2857 		}
   2858 	    }
   2859 #if SDL_ALTIVEC_BLITTERS
   2860 	    if (sf->Amask && sf->BytesPerPixel == 4 &&
   2861 	        !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
   2862 		return Blit32to32PixelAlphaAltivec;
   2863 	    else
   2864 #endif
   2865 		return BlitNtoNPixelAlpha;
   2866 
   2867 	case 3:
   2868 	default:
   2869 	    return BlitNtoNPixelAlpha;
   2870 	}
   2871     }
   2872 }
   2873 
   2874