Home | History | Annotate | Download | only in video
      1 /*
      2     SDL - Simple DirectMedia Layer
      3     Copyright (C) 1997-2006 Sam Lantinga
      4 
      5     This library is free software; you can redistribute it and/or
      6     modify it under the terms of the GNU Lesser General Public
      7     License as published by the Free Software Foundation; either
      8     version 2.1 of the License, or (at your option) any later version.
      9 
     10     This library is distributed in the hope that it will be useful,
     11     but WITHOUT ANY WARRANTY; without even the implied warranty of
     12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13     Lesser General Public License for more details.
     14 
     15     You should have received a copy of the GNU Lesser General Public
     16     License along with this library; if not, write to the Free Software
     17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
     18 
     19     Sam Lantinga
     20     slouken (at) libsdl.org
     21 */
     22 #include "SDL_config.h"
     23 
     24 #include "SDL_video.h"
     25 #include "SDL_blit.h"
     26 
/*
  In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
   Checking if _mm_free is #defined in malloc.h is the only way to
   determine if the Processor Pack is installed, as far as I can tell.
*/
     32 
     33 #if SDL_ASSEMBLY_ROUTINES
     34 #  if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
     35 #    define MMX_ASMBLIT 1
     36 #    define GCC_ASMBLIT 1
     37 #  elif defined(_MSC_VER) && defined(_M_IX86)
     38 #    if (_MSC_VER <= 1200)
     39 #      include <malloc.h>
     40 #      if defined(_mm_free)
     41 #          define HAVE_MMINTRIN_H 1
     42 #      endif
     43 #    else  /* Visual Studio > VC6 always has mmintrin.h */
     44 #      define HAVE_MMINTRIN_H 1
     45 #    endif
     46 #    if HAVE_MMINTRIN_H
     47 #      define MMX_ASMBLIT 1
     48 #      define MSVC_ASMBLIT 1
     49 #    endif
     50 #  endif
     51 #endif /* SDL_ASSEMBLY_ROUTINES */
     52 
     53 /* Function to check the CPU flags */
     54 #include "SDL_cpuinfo.h"
     55 #if GCC_ASMBLIT
     56 #include "mmx.h"
     57 #elif MSVC_ASMBLIT
     58 #include <mmintrin.h>
     59 #include <mm3dnow.h>
     60 #endif
     61 
     62 /* Functions to perform alpha blended blitting */
     63 
     64 /* N->1 blending with per-surface alpha */
     65 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
     66 {
     67 	int width = info->d_width;
     68 	int height = info->d_height;
     69 	Uint8 *src = info->s_pixels;
     70 	int srcskip = info->s_skip;
     71 	Uint8 *dst = info->d_pixels;
     72 	int dstskip = info->d_skip;
     73 	Uint8 *palmap = info->table;
     74 	SDL_PixelFormat *srcfmt = info->src;
     75 	SDL_PixelFormat *dstfmt = info->dst;
     76 	int srcbpp = srcfmt->BytesPerPixel;
     77 
     78 	const unsigned A = srcfmt->alpha;
     79 
     80 	while ( height-- ) {
     81 	    DUFFS_LOOP4(
     82 	    {
     83 		Uint32 Pixel;
     84 		unsigned sR;
     85 		unsigned sG;
     86 		unsigned sB;
     87 		unsigned dR;
     88 		unsigned dG;
     89 		unsigned dB;
     90 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
     91 		dR = dstfmt->palette->colors[*dst].r;
     92 		dG = dstfmt->palette->colors[*dst].g;
     93 		dB = dstfmt->palette->colors[*dst].b;
     94 		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
     95 		dR &= 0xff;
     96 		dG &= 0xff;
     97 		dB &= 0xff;
     98 		/* Pack RGB into 8bit pixel */
     99 		if ( palmap == NULL ) {
    100 		    *dst =((dR>>5)<<(3+2))|
    101 			  ((dG>>5)<<(2))|
    102 			  ((dB>>6)<<(0));
    103 		} else {
    104 		    *dst = palmap[((dR>>5)<<(3+2))|
    105 				  ((dG>>5)<<(2))  |
    106 				  ((dB>>6)<<(0))];
    107 		}
    108 		dst++;
    109 		src += srcbpp;
    110 	    },
    111 	    width);
    112 	    src += srcskip;
    113 	    dst += dstskip;
    114 	}
    115 }
    116 
    117 /* N->1 blending with pixel alpha */
    118 static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
    119 {
    120 	int width = info->d_width;
    121 	int height = info->d_height;
    122 	Uint8 *src = info->s_pixels;
    123 	int srcskip = info->s_skip;
    124 	Uint8 *dst = info->d_pixels;
    125 	int dstskip = info->d_skip;
    126 	Uint8 *palmap = info->table;
    127 	SDL_PixelFormat *srcfmt = info->src;
    128 	SDL_PixelFormat *dstfmt = info->dst;
    129 	int srcbpp = srcfmt->BytesPerPixel;
    130 
    131 	/* FIXME: fix alpha bit field expansion here too? */
    132 	while ( height-- ) {
    133 	    DUFFS_LOOP4(
    134 	    {
    135 		Uint32 Pixel;
    136 		unsigned sR;
    137 		unsigned sG;
    138 		unsigned sB;
    139 		unsigned sA;
    140 		unsigned dR;
    141 		unsigned dG;
    142 		unsigned dB;
    143 		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
    144 		dR = dstfmt->palette->colors[*dst].r;
    145 		dG = dstfmt->palette->colors[*dst].g;
    146 		dB = dstfmt->palette->colors[*dst].b;
    147 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
    148 		dR &= 0xff;
    149 		dG &= 0xff;
    150 		dB &= 0xff;
    151 		/* Pack RGB into 8bit pixel */
    152 		if ( palmap == NULL ) {
    153 		    *dst =((dR>>5)<<(3+2))|
    154 			  ((dG>>5)<<(2))|
    155 			  ((dB>>6)<<(0));
    156 		} else {
    157 		    *dst = palmap[((dR>>5)<<(3+2))|
    158 				  ((dG>>5)<<(2))  |
    159 				  ((dB>>6)<<(0))  ];
    160 		}
    161 		dst++;
    162 		src += srcbpp;
    163 	    },
    164 	    width);
    165 	    src += srcskip;
    166 	    dst += dstskip;
    167 	}
    168 }
    169 
    170 /* colorkeyed N->1 blending with per-surface alpha */
    171 static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
    172 {
    173 	int width = info->d_width;
    174 	int height = info->d_height;
    175 	Uint8 *src = info->s_pixels;
    176 	int srcskip = info->s_skip;
    177 	Uint8 *dst = info->d_pixels;
    178 	int dstskip = info->d_skip;
    179 	Uint8 *palmap = info->table;
    180 	SDL_PixelFormat *srcfmt = info->src;
    181 	SDL_PixelFormat *dstfmt = info->dst;
    182 	int srcbpp = srcfmt->BytesPerPixel;
    183 	Uint32 ckey = srcfmt->colorkey;
    184 
    185 	const int A = srcfmt->alpha;
    186 
    187 	while ( height-- ) {
    188 	    DUFFS_LOOP(
    189 	    {
    190 		Uint32 Pixel;
    191 		unsigned sR;
    192 		unsigned sG;
    193 		unsigned sB;
    194 		unsigned dR;
    195 		unsigned dG;
    196 		unsigned dB;
    197 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    198 		if ( Pixel != ckey ) {
    199 		    dR = dstfmt->palette->colors[*dst].r;
    200 		    dG = dstfmt->palette->colors[*dst].g;
    201 		    dB = dstfmt->palette->colors[*dst].b;
    202 		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
    203 		    dR &= 0xff;
    204 		    dG &= 0xff;
    205 		    dB &= 0xff;
    206 		    /* Pack RGB into 8bit pixel */
    207 		    if ( palmap == NULL ) {
    208 			*dst =((dR>>5)<<(3+2))|
    209 			      ((dG>>5)<<(2)) |
    210 			      ((dB>>6)<<(0));
    211 		    } else {
    212 			*dst = palmap[((dR>>5)<<(3+2))|
    213 				      ((dG>>5)<<(2))  |
    214 				      ((dB>>6)<<(0))  ];
    215 		    }
    216 		}
    217 		dst++;
    218 		src += srcbpp;
    219 	    },
    220 	    width);
    221 	    src += srcskip;
    222 	    dst += dstskip;
    223 	}
    224 }
    225 
    226 #if GCC_ASMBLIT
    227 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
    228 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
    229 {
    230 	int width = info->d_width;
    231 	int height = info->d_height;
    232 	Uint32 *srcp = (Uint32 *)info->s_pixels;
    233 	int srcskip = info->s_skip >> 2;
    234 	Uint32 *dstp = (Uint32 *)info->d_pixels;
    235 	int dstskip = info->d_skip >> 2;
    236 	Uint32 dalpha = info->dst->Amask;
    237 	Uint8 load[8];
    238 
    239 	*(Uint64 *)load = 0x00fefefe00fefefeULL;/* alpha128 mask */
    240 	movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */
    241 	*(Uint64 *)load = 0x0001010100010101ULL;/* !alpha128 mask */
    242 	movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */
    243 	movd_m2r(dalpha, mm7); /* dst alpha mask */
    244 	punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
    245 	while(height--) {
    246 		DUFFS_LOOP_DOUBLE2(
    247 		{
    248 			Uint32 s = *srcp++;
    249 			Uint32 d = *dstp;
    250 			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
    251 				   + (s & d & 0x00010101)) | dalpha;
    252 		},{
    253 			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
    254 			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
    255 
    256 			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
    257 			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
    258 
    259 			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
    260 			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
    261 			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
    262 			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
    263 			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
    264 			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
    265 			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
    266 
    267 			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
    268 			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
    269 			dstp += 2;
    270 			srcp += 2;
    271 		}, width);
    272 		srcp += srcskip;
    273 		dstp += dstskip;
    274 	}
    275 	emms();
    276 }
    277 
    278 /* fast RGB888->(A)RGB888 blending with surface alpha */
    279 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
    280 {
    281 	SDL_PixelFormat* df = info->dst;
    282 	unsigned alpha = info->src->alpha;
    283 
    284 	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
    285 			/* only call a128 version when R,G,B occupy lower bits */
    286 		BlitRGBtoRGBSurfaceAlpha128MMX(info);
    287 	} else {
    288 		int width = info->d_width;
    289 		int height = info->d_height;
    290 		Uint32 *srcp = (Uint32 *)info->s_pixels;
    291 		int srcskip = info->s_skip >> 2;
    292 		Uint32 *dstp = (Uint32 *)info->d_pixels;
    293 		int dstskip = info->d_skip >> 2;
    294 
    295 		pxor_r2r(mm5, mm5); /* 0 -> mm5 */
    296 		/* form the alpha mult */
    297 		movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
    298 		punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
    299 		punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
    300 		alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
    301 		movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
    302 		punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
    303 		pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
    304 			/* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
    305 		movd_m2r(df->Amask, mm7); /* dst alpha mask */
    306 		punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
    307 
    308 		while(height--) {
    309 			DUFFS_LOOP_DOUBLE2({
    310 				/* One Pixel Blend */
    311 				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
    312 				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
    313 				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
    314 				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
    315 
    316 				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
    317 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
    318 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
    319 				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
    320 
    321 				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
    322 				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
    323 				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
    324 				++srcp;
    325 				++dstp;
    326 			},{
    327 				/* Two Pixels Blend */
    328 				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
    329 				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
    330 				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
    331 				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
    332 
    333 				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
    334 				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
    335 				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
    336 				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
    337 
    338 				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
    339 				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
    340 				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
    341 				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
    342 
    343 				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
    344 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
    345 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
    346 				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
    347 
    348 				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
    349 				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
    350 
    351 				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
    352 
    353   				srcp += 2;
    354   				dstp += 2;
    355   			}, width);
    356 			srcp += srcskip;
    357 			dstp += dstskip;
    358 		}
    359 		emms();
    360 	}
    361 }
    362 
/*
 * fast ARGB888->(A)RGB888 blending with pixel alpha
 * (GCC inline-assembly MMX version).
 *
 * Each source pixel is handled according to its own alpha bits:
 *   - alpha == 0:     the destination pixel is left untouched;
 *   - alpha == amask: the source colour channels replace the destination's,
 *                     the remaining destination bits are kept;
 *   - otherwise:      per channel, dst + (((src - dst) * alpha) >> 8), with
 *                     the multiplier of the alpha lane zeroed so that the
 *                     destination alpha byte is preserved.
 *
 * MMX registers set up before the loop and kept for its whole duration:
 *   mm0 = channel mask, mm3 = ~channel mask, mm5 = alpha shift count,
 *   mm6 = zero, mm7 = multiplication mask.
 *
 * The row skips in `info` are in bytes and are converted to 32-bit pixel
 * units.
 */
static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	SDL_PixelFormat* sf = info->src;
	Uint32 amask = sf->Amask;

	pxor_r2r(mm6, mm6); /* 0 -> mm6 */
	/* form multiplication mask */
	movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
	punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
	pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
	movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
	pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
	/* form channel masks */
	movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
	packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
	packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
	pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
	/* get alpha channel shift */
	__asm__ __volatile__ (
		"movd %0, %%mm5"
		: : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */

	while(height--) {
	    DUFFS_LOOP4({
		/* alpha bits of the source pixel, still at their position in the pixel */
		Uint32 alpha = *srcp & amask;
		/* FIXME: Here we special-case opaque alpha since the
			compositioning used (>>8 instead of /255) doesn't handle
			it correctly. Also special-case alpha=0 for speed?
			Benchmark this! */
		if(alpha == 0) {
			/* do nothing */
		} else if(alpha == amask) {
			/* opaque alpha -- copy RGB, keep dst alpha */
			/* using MMX here to free up regular registers for other things */
			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
			por_r2r(mm1, mm2); /* src | dst -> mm2 */
			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
		} else {
			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */

			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */

			/* broadcast this pixel's alpha into all four 16-bit lanes */
			__asm__ __volatile__ (
				"movd %0, %%mm4"
				: : "r" (alpha) ); /* 0000A000 -> mm4 */
			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */

			/* blend */
			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */

			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
			movd_r2m(mm2, *dstp);/* mm2 -> dst */
		}
		++srcp;
		++dstp;
	    }, width);
	    /* step over the padding at the end of each row */
	    srcp += srcskip;
	    dstp += dstskip;
	}
	emms();
}
    442 /* End GCC_ASMBLIT */
    443 
    444 #elif MSVC_ASMBLIT
    445 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
    446 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
    447 {
    448 	int width = info->d_width;
    449 	int height = info->d_height;
    450 	Uint32 *srcp = (Uint32 *)info->s_pixels;
    451 	int srcskip = info->s_skip >> 2;
    452 	Uint32 *dstp = (Uint32 *)info->d_pixels;
    453 	int dstskip = info->d_skip >> 2;
    454 	Uint32 dalpha = info->dst->Amask;
    455 
    456 	__m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
    457 
    458 	hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
    459 	lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
    460 	dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
    461 
    462 	while (height--) {
    463 		int n = width;
    464 		if ( n & 1 ) {
    465 			Uint32 s = *srcp++;
    466 			Uint32 d = *dstp;
    467 			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
    468 				   + (s & d & 0x00010101)) | dalpha;
    469 			n--;
    470 		}
    471 
    472 		for (n >>= 1; n > 0; --n) {
    473 			dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
    474 			dst2 = dst1;   /* 2 x dst -> dst2(ARGBARGB) */
    475 
    476 			src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
    477 			src2 = src1; /* 2 x src -> src2(ARGBARGB) */
    478 
    479 			dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
    480 			src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
    481 			src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
    482 			src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
    483 
    484 			dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
    485 			dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
    486 			dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
    487 			dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
    488 
    489 			*(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
    490 			dstp += 2;
    491 			srcp += 2;
    492 		}
    493 
    494 		srcp += srcskip;
    495 		dstp += dstskip;
    496 	}
    497 	_mm_empty();
    498 }
    499 
    500 /* fast RGB888->(A)RGB888 blending with surface alpha */
    501 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
    502 {
    503 	SDL_PixelFormat* df = info->dst;
    504 	Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
    505 	unsigned alpha = info->src->alpha;
    506 
    507 	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
    508 			/* only call a128 version when R,G,B occupy lower bits */
    509 		BlitRGBtoRGBSurfaceAlpha128MMX(info);
    510 	} else {
    511 		int width = info->d_width;
    512 		int height = info->d_height;
    513 		Uint32 *srcp = (Uint32 *)info->s_pixels;
    514 		int srcskip = info->s_skip >> 2;
    515 		Uint32 *dstp = (Uint32 *)info->d_pixels;
    516 		int dstskip = info->d_skip >> 2;
    517 		Uint32 dalpha = df->Amask;
    518 		Uint32 amult;
    519 
    520 		__m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
    521 
    522 		mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
    523 		/* form the alpha mult */
    524 		amult = alpha | (alpha << 8);
    525 		amult = amult | (amult << 16);
    526 		chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
    527 		mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
    528 		mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
    529 			/* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
    530 		dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
    531 
    532 		while (height--) {
    533 			int n = width;
    534 			if (n & 1) {
    535 				/* One Pixel Blend */
    536 				src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
    537 				src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
    538 
    539 				dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
    540 				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
    541 
    542 				src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
    543 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
    544 				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
    545 				dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
    546 
    547 				dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
    548 				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
    549 				*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
    550 
    551 				++srcp;
    552 				++dstp;
    553 
    554 				n--;
    555 			}
    556 
    557 			for (n >>= 1; n > 0; --n) {
    558 				/* Two Pixels Blend */
    559 				src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
    560 				src2 = src1; /* 2 x src -> src2(ARGBARGB) */
    561 				src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
    562 				src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
    563 
    564 				dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
    565 				dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
    566 				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
    567 				dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
    568 
    569 				src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
    570 				src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
    571 				src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
    572 				dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
    573 
    574 				src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
    575 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
    576 				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
    577 				dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
    578 
    579 				dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
    580 				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
    581 
    582 				*(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
    583 
    584 				srcp += 2;
    585 				dstp += 2;
    586 			}
    587 			srcp += srcskip;
    588 			dstp += dstskip;
    589 		}
    590 		_mm_empty();
    591 	}
    592 }
    593 
    594 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
    595 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
    596 {
    597 	int width = info->d_width;
    598 	int height = info->d_height;
    599 	Uint32 *srcp = (Uint32 *)info->s_pixels;
    600 	int srcskip = info->s_skip >> 2;
    601 	Uint32 *dstp = (Uint32 *)info->d_pixels;
    602 	int dstskip = info->d_skip >> 2;
    603 	SDL_PixelFormat* sf = info->src;
    604 	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
    605 	Uint32 amask = sf->Amask;
    606 	Uint32 ashift = sf->Ashift;
    607 	Uint64 multmask;
    608 
    609 	__m64 src1, dst1, mm_alpha, mm_zero, dmask;
    610 
    611 	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
    612 	multmask = ~(0xFFFFi64 << (ashift * 2));
    613 	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
    614 
    615 	while(height--) {
    616 		DUFFS_LOOP4({
    617 		Uint32 alpha = *srcp & amask;
    618 		if (alpha == 0) {
    619 			/* do nothing */
    620 		} else if (alpha == amask) {
    621 			/* opaque alpha -- copy RGB, keep dst alpha */
    622 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
    623 		} else {
    624 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
    625 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
    626 
    627 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
    628 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
    629 
    630 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
    631 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
    632 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
    633 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
    634 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
    635 
    636 			/* blend */
    637 			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
    638 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
    639 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
    640 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
    641 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
    642 
    643 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
    644 		}
    645 		++srcp;
    646 		++dstp;
    647 	    }, width);
    648 	    srcp += srcskip;
    649 	    dstp += dstskip;
    650 	}
    651 	_mm_empty();
    652 }
    653 /* End MSVC_ASMBLIT */
    654 
    655 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
    656 
    657 #if SDL_ALTIVEC_BLITTERS
    658 #if __MWERKS__
    659 #pragma altivec_model on
    660 #endif
    661 #if HAVE_ALTIVEC_H
    662 #include <altivec.h>
    663 #endif
    664 #include <assert.h>
    665 
/*
 * Vector literal helpers.  With GCC older than version 4 on Mac OS X the
 * elements of a vector literal are written in parentheses; in every other
 * configuration the brace form is used.  VECUINT8_LITERAL takes 16 byte
 * values, VECUINT16_LITERAL takes 8 halfword values.
 */
#if (defined(__MACOSX__) && (__GNUC__ < 4))
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) ( a,b,c,d,e,f,g,h )
#else
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) { a,b,c,d,e,f,g,h }
#endif
    677 
    678 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
/*
 * Debugging aid: prints `msg` followed by the four 32-bit words of vector
 * `v` in hexadecimal.  The vector is copied to a temporary first so that
 * its words can be read through an unsigned int pointer.
 * NOTE(review): uses printf, so <stdio.h> must be visible wherever this
 * macro is expanded.
 */
#define VECPRINT(msg, v) do { \
    vector unsigned int tmpvec = (vector unsigned int)(v); \
    unsigned int *vp = (unsigned int *)&tmpvec; \
    printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
} while (0)
    684 
/* the permutation vector that takes the high bytes out of all the appropriate shorts
    (vector unsigned char)(
        0x00, 0x10, 0x02, 0x12,
        0x04, 0x14, 0x06, 0x16,
        0x08, 0x18, 0x0A, 0x1A,
        0x0C, 0x1C, 0x0E, 0x1E );
   It is computed as the identity byte sequence (vec_lvsl at address 0)
   plus 0x000F in every halfword, rather than spelled out as a literal.
*/
#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
/* 24 in every 32-bit element, built as 12 + 12 (the vec_splat_u32 literal
   is too narrow to hold 24 directly) */
#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
/* all-ones shifted left by 24 in every 32-bit element, i.e. 0xFF000000 per pixel */
#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
/* Permute control vector for `src`: vec_lvsl(0, src) when src is not
   16-byte aligned, otherwise vec_lvsl(8, src) with 8 added to every byte.
   NOTE(review): presumably the aligned case is meant to yield indices that
   pick the second of the two vectors later handed to vec_perm -- confirm
   against the callers. */
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    ? vec_lvsl(0, src) \
    : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
    698 
    699 
/*
 * Blend vector `vs` over vector `vd` with the per-byte alpha in `valpha`
 * and leave the result in `vd`.
 *
 * For every byte the macro forms vs * alpha + vd * (255 - alpha) as a
 * 16-bit value (even and odd bytes are multiplied separately), then
 * computes x = (t + 1) + ((t + 1) >> 8).  The final >> 8 is not done by a
 * shift: `mergePermute` picks the high byte of every 16-bit result while
 * interleaving the even and odd halves back into pixel order.
 *
 * v1_16 and v8_16 must be vectors holding 1 and 8 in every halfword.
 */
#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
    /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
    vector unsigned short vtemp1 = vec_mule(vs, valpha); \
    /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
    vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
    /* valpha2 is 255-alpha */ \
    vector unsigned char valpha2 = vec_nor(valpha, valpha); \
    /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
    vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
    /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
    vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
    /* add source and dest */ \
    vtemp1 = vec_add(vtemp1, vtemp3); \
    vtemp2 = vec_add(vtemp2, vtemp4); \
    /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
    vtemp1 = vec_add(vtemp1, v1_16); \
    vtemp3 = vec_sr(vtemp1, v8_16); \
    vtemp1 = vec_add(vtemp1, vtemp3); \
    /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
    vtemp2 = vec_add(vtemp2, v1_16); \
    vtemp4 = vec_sr(vtemp2, v8_16); \
    vtemp2 = vec_add(vtemp2, vtemp4); \
    /* (>>8) and get ARGBARGBARGBARGB */ \
    vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
} while (0)
    725 
    726 /* Calculate the permute vector used for 32->32 swizzling */
    727 static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
    728                                   const SDL_PixelFormat *dstfmt)
    729 {
    730     /*
    731      * We have to assume that the bits that aren't used by other
    732      *  colors is alpha, and it's one complete byte, since some formats
    733      *  leave alpha with a zero mask, but we should still swizzle the bits.
    734      */
    735     /* ARGB */
    736     const static struct SDL_PixelFormat default_pixel_format = {
    737         NULL, 0, 0,
    738         0, 0, 0, 0,
    739         16, 8, 0, 24,
    740         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
    741         0, 0};
    742     if (!srcfmt) {
    743         srcfmt = &default_pixel_format;
    744     }
    745     if (!dstfmt) {
    746         dstfmt = &default_pixel_format;
    747     }
    748     const vector unsigned char plus = VECUINT8_LITERAL
    749                                             ( 0x00, 0x00, 0x00, 0x00,
    750                                               0x04, 0x04, 0x04, 0x04,
    751                                               0x08, 0x08, 0x08, 0x08,
    752                                               0x0C, 0x0C, 0x0C, 0x0C );
    753     vector unsigned char vswiz;
    754     vector unsigned int srcvec;
    755 #define RESHIFT(X) (3 - ((X) >> 3))
    756     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
    757     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
    758     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
    759     Uint32 amask;
    760     /* Use zero for alpha if either surface doesn't have alpha */
    761     if (dstfmt->Amask) {
    762         amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
    763     } else {
    764         amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
    765     }
    766 #undef RESHIFT
    767     ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
    768     vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
    769     return(vswiz);
    770 }
    771 
    772 static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
    773 {
    774     int height = info->d_height;
    775     Uint8 *src = (Uint8 *)info->s_pixels;
    776     int srcskip = info->s_skip;
    777     Uint8 *dst = (Uint8 *)info->d_pixels;
    778     int dstskip = info->d_skip;
    779     SDL_PixelFormat *srcfmt = info->src;
    780 
    781     vector unsigned char v0 = vec_splat_u8(0);
    782     vector unsigned short v8_16 = vec_splat_u16(8);
    783     vector unsigned short v1_16 = vec_splat_u16(1);
    784     vector unsigned short v2_16 = vec_splat_u16(2);
    785     vector unsigned short v3_16 = vec_splat_u16(3);
    786     vector unsigned int v8_32 = vec_splat_u32(8);
    787     vector unsigned int v16_32 = vec_add(v8_32, v8_32);
    788     vector unsigned short v3f = VECUINT16_LITERAL(
    789         0x003f, 0x003f, 0x003f, 0x003f,
    790         0x003f, 0x003f, 0x003f, 0x003f);
    791     vector unsigned short vfc = VECUINT16_LITERAL(
    792         0x00fc, 0x00fc, 0x00fc, 0x00fc,
    793         0x00fc, 0x00fc, 0x00fc, 0x00fc);
    794 
    795     /*
    796         0x10 - 0x1f is the alpha
    797         0x00 - 0x0e evens are the red
    798         0x01 - 0x0f odds are zero
    799     */
    800     vector unsigned char vredalpha1 = VECUINT8_LITERAL(
    801         0x10, 0x00, 0x01, 0x01,
    802         0x10, 0x02, 0x01, 0x01,
    803         0x10, 0x04, 0x01, 0x01,
    804         0x10, 0x06, 0x01, 0x01
    805     );
    806     vector unsigned char vredalpha2 = (vector unsigned char)(
    807         vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
    808     );
    809     /*
    810         0x00 - 0x0f is ARxx ARxx ARxx ARxx
    811         0x11 - 0x0f odds are blue
    812     */
    813     vector unsigned char vblue1 = VECUINT8_LITERAL(
    814         0x00, 0x01, 0x02, 0x11,
    815         0x04, 0x05, 0x06, 0x13,
    816         0x08, 0x09, 0x0a, 0x15,
    817         0x0c, 0x0d, 0x0e, 0x17
    818     );
    819     vector unsigned char vblue2 = (vector unsigned char)(
    820         vec_add((vector unsigned int)vblue1, v8_32)
    821     );
    822     /*
    823         0x00 - 0x0f is ARxB ARxB ARxB ARxB
    824         0x10 - 0x0e evens are green
    825     */
    826     vector unsigned char vgreen1 = VECUINT8_LITERAL(
    827         0x00, 0x01, 0x10, 0x03,
    828         0x04, 0x05, 0x12, 0x07,
    829         0x08, 0x09, 0x14, 0x0b,
    830         0x0c, 0x0d, 0x16, 0x0f
    831     );
    832     vector unsigned char vgreen2 = (vector unsigned char)(
    833         vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
    834     );
    835     vector unsigned char vgmerge = VECUINT8_LITERAL(
    836         0x00, 0x02, 0x00, 0x06,
    837         0x00, 0x0a, 0x00, 0x0e,
    838         0x00, 0x12, 0x00, 0x16,
    839         0x00, 0x1a, 0x00, 0x1e);
    840     vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
    841     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
    842     vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
    843 
    844     vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
    845     vf800 = vec_sl(vf800, vec_splat_u16(8));
    846 
    847     while(height--) {
    848         int extrawidth;
    849         vector unsigned char valigner;
    850         vector unsigned char vsrc;
    851         vector unsigned char voverflow;
    852         int width = info->d_width;
    853 
    854 #define ONE_PIXEL_BLEND(condition, widthvar) \
    855         while (condition) { \
    856             Uint32 Pixel; \
    857             unsigned sR, sG, sB, dR, dG, dB, sA; \
    858             DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
    859             if(sA) { \
    860                 unsigned short dstpixel = *((unsigned short *)dst); \
    861                 dR = (dstpixel >> 8) & 0xf8; \
    862                 dG = (dstpixel >> 3) & 0xfc; \
    863                 dB = (dstpixel << 3) & 0xf8; \
    864                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
    865                 *((unsigned short *)dst) = ( \
    866                     ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
    867                 ); \
    868             } \
    869             src += 4; \
    870             dst += 2; \
    871             widthvar--; \
    872         }
    873         ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
    874         extrawidth = (width % 8);
    875         valigner = VEC_ALIGNER(src);
    876         vsrc = (vector unsigned char)vec_ld(0, src);
    877         width -= extrawidth;
    878         while (width) {
    879             vector unsigned char valpha;
    880             vector unsigned char vsrc1, vsrc2;
    881             vector unsigned char vdst1, vdst2;
    882             vector unsigned short vR, vG, vB;
    883             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
    884 
    885             /* Load 8 pixels from src as ARGB */
    886             voverflow = (vector unsigned char)vec_ld(15, src);
    887             vsrc = vec_perm(vsrc, voverflow, valigner);
    888             vsrc1 = vec_perm(vsrc, vsrc, vpermute);
    889             src += 16;
    890             vsrc = (vector unsigned char)vec_ld(15, src);
    891             voverflow = vec_perm(voverflow, vsrc, valigner);
    892             vsrc2 = vec_perm(voverflow, voverflow, vpermute);
    893             src += 16;
    894 
    895             /* Load 8 pixels from dst as XRGB */
    896             voverflow = vec_ld(0, dst);
    897             vR = vec_and((vector unsigned short)voverflow, vf800);
    898             vB = vec_sl((vector unsigned short)voverflow, v3_16);
    899             vG = vec_sl(vB, v2_16);
    900             vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
    901             vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
    902             vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
    903             vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
    904             vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
    905             vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
    906 
    907             /* Alpha blend 8 pixels as ARGB */
    908             valpha = vec_perm(vsrc1, v0, valphaPermute);
    909             VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
    910             valpha = vec_perm(vsrc2, v0, valphaPermute);
    911             VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
    912 
    913             /* Convert 8 pixels to 565 */
    914             vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
    915             vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
    916             vgpixel = vec_and(vgpixel, vfc);
    917             vgpixel = vec_sl(vgpixel, v3_16);
    918             vrpixel = vec_sl(vpixel, v1_16);
    919             vrpixel = vec_and(vrpixel, vf800);
    920             vbpixel = vec_and(vpixel, v3f);
    921             vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
    922             vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
    923 
    924             /* Store 8 pixels */
    925             vec_st(vdst1, 0, dst);
    926 
    927             width -= 8;
    928             dst += 16;
    929         }
    930         ONE_PIXEL_BLEND((extrawidth), extrawidth);
    931 #undef ONE_PIXEL_BLEND
    932         src += srcskip;
    933         dst += dstskip;
    934     }
    935 }
    936 
    937 static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
    938 {
    939     unsigned alpha = info->src->alpha;
    940     int height = info->d_height;
    941     Uint32 *srcp = (Uint32 *)info->s_pixels;
    942     int srcskip = info->s_skip >> 2;
    943     Uint32 *dstp = (Uint32 *)info->d_pixels;
    944     int dstskip = info->d_skip >> 2;
    945     SDL_PixelFormat *srcfmt = info->src;
    946     SDL_PixelFormat *dstfmt = info->dst;
    947     unsigned sA = srcfmt->alpha;
    948     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
    949     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
    950     Uint32 ckey = info->src->colorkey;
    951     vector unsigned char mergePermute;
    952     vector unsigned char vsrcPermute;
    953     vector unsigned char vdstPermute;
    954     vector unsigned char vsdstPermute;
    955     vector unsigned char valpha;
    956     vector unsigned char valphamask;
    957     vector unsigned char vbits;
    958     vector unsigned char v0;
    959     vector unsigned short v1;
    960     vector unsigned short v8;
    961     vector unsigned int vckey;
    962     vector unsigned int vrgbmask;
    963 
    964     mergePermute = VEC_MERGE_PERMUTE();
    965     v0 = vec_splat_u8(0);
    966     v1 = vec_splat_u16(1);
    967     v8 = vec_splat_u16(8);
    968 
    969     /* set the alpha to 255 on the destination surf */
    970     valphamask = VEC_ALPHA_MASK();
    971 
    972     vsrcPermute = calc_swizzle32(srcfmt, NULL);
    973     vdstPermute = calc_swizzle32(NULL, dstfmt);
    974     vsdstPermute = calc_swizzle32(dstfmt, NULL);
    975 
    976     /* set a vector full of alpha and 255-alpha */
    977     ((unsigned char *)&valpha)[0] = alpha;
    978     valpha = vec_splat(valpha, 0);
    979     vbits = (vector unsigned char)vec_splat_s8(-1);
    980 
    981     ckey &= rgbmask;
    982     ((unsigned int *)(char*)&vckey)[0] = ckey;
    983     vckey = vec_splat(vckey, 0);
    984     ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
    985     vrgbmask = vec_splat(vrgbmask, 0);
    986 
    987     while(height--) {
    988         int width = info->d_width;
    989 #define ONE_PIXEL_BLEND(condition, widthvar) \
    990         while (condition) { \
    991             Uint32 Pixel; \
    992             unsigned sR, sG, sB, dR, dG, dB; \
    993             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
    994             if(sA && Pixel != ckey) { \
    995                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
    996                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
    997                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
    998                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
    999             } \
   1000             dstp++; \
   1001             srcp++; \
   1002             widthvar--; \
   1003         }
   1004         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   1005         if (width > 0) {
   1006             int extrawidth = (width % 4);
   1007             vector unsigned char valigner = VEC_ALIGNER(srcp);
   1008             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
   1009             width -= extrawidth;
   1010             while (width) {
   1011                 vector unsigned char vsel;
   1012                 vector unsigned char voverflow;
   1013                 vector unsigned char vd;
   1014                 vector unsigned char vd_orig;
   1015 
   1016                 /* s = *srcp */
   1017                 voverflow = (vector unsigned char)vec_ld(15, srcp);
   1018                 vs = vec_perm(vs, voverflow, valigner);
   1019 
   1020                 /* vsel is set for items that match the key */
   1021                 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
   1022                 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
   1023 
   1024                 /* permute to source format */
   1025                 vs = vec_perm(vs, valpha, vsrcPermute);
   1026 
   1027                 /* d = *dstp */
   1028                 vd = (vector unsigned char)vec_ld(0, dstp);
   1029                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
   1030 
   1031                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   1032 
   1033                 /* set the alpha channel to full on */
   1034                 vd = vec_or(vd, valphamask);
   1035 
   1036                 /* mask out color key */
   1037                 vd = vec_sel(vd, vd_orig, vsel);
   1038 
   1039                 /* permute to dest format */
   1040                 vd = vec_perm(vd, vbits, vdstPermute);
   1041 
   1042                 /* *dstp = res */
   1043                 vec_st((vector unsigned int)vd, 0, dstp);
   1044 
   1045                 srcp += 4;
   1046                 dstp += 4;
   1047                 width -= 4;
   1048                 vs = voverflow;
   1049             }
   1050             ONE_PIXEL_BLEND((extrawidth), extrawidth);
   1051         }
   1052 #undef ONE_PIXEL_BLEND
   1053 
   1054         srcp += srcskip;
   1055         dstp += dstskip;
   1056     }
   1057 }
   1058 
   1059 
   /*
    * AltiVec blit of a 32-bit surface with per-pixel alpha onto a 32-bit
    * surface; source and destination channel layouts are taken from
    * info->src and info->dst.
    *
    * Each row is handled in three phases: single pixels while
    * UNALIGNED_PTR(dstp) holds, then four pixels per vector iteration, then
    * the (width % 4) leftover pixels one at a time.  The vector path swizzles
    * both surfaces with the calc_swizzle32 permutes, blends with
    * VEC_MULTIPLY_ALPHA and puts the destination's own alpha bytes back
    * before storing.
    */
   1060 static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
   1061 {
   1062     int width = info->d_width;
   1063     int height = info->d_height;
   1064     Uint32 *srcp = (Uint32 *)info->s_pixels;
            /* skips are shifted by 2 because srcp/dstp advance in 32-bit pixels
               (assumes s_skip/d_skip are byte counts -- TODO confirm) */
   1065     int srcskip = info->s_skip >> 2;
   1066     Uint32 *dstp = (Uint32 *)info->d_pixels;
   1067     int dstskip = info->d_skip >> 2;
   1068     SDL_PixelFormat *srcfmt = info->src;
   1069     SDL_PixelFormat *dstfmt = info->dst;
   1070     vector unsigned char mergePermute;
   1071     vector unsigned char valphaPermute;
   1072     vector unsigned char vsrcPermute;
   1073     vector unsigned char vdstPermute;
   1074     vector unsigned char vsdstPermute;
   1075     vector unsigned char valphamask;
   1076     vector unsigned char vpixelmask;
   1077     vector unsigned char v0;
   1078     vector unsigned short v1;
   1079     vector unsigned short v8;
   1080 
   1081     v0 = vec_splat_u8(0);
   1082     v1 = vec_splat_u16(1);
   1083     v8 = vec_splat_u16(8);
   1084     mergePermute = VEC_MERGE_PERMUTE();
   1085     valphamask = VEC_ALPHA_MASK();
            /* 0,0,0,0,4,4,4,4,...: replicates byte 0 of every pixel across that pixel */
   1086     valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
            /* bitwise complement of valphamask */
   1087     vpixelmask = vec_nor(valphamask, v0);
            /* source -> working layout, working layout -> destination, destination -> working layout */
   1088     vsrcPermute = calc_swizzle32(srcfmt, NULL);
   1089     vdstPermute = calc_swizzle32(NULL, dstfmt);
   1090     vsdstPermute = calc_swizzle32(dstfmt, NULL);
   1091 
   1092 	while ( height-- ) {
   1093         width = info->d_width;
   1094 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
   1095             Uint32 Pixel; \
   1096             unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
   1097             DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
   1098             if(sA) { \
   1099               DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
   1100               ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   1101               ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
   1102             } \
   1103             ++srcp; \
   1104             ++dstp; \
   1105             widthvar--; \
   1106         }
                /* phase 1: scalar pixels until dstp no longer tests as unaligned */
   1107         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   1108         if (width > 0) {
   1109             /* vsrcPermute */
   1110             /* vdstPermute */
   1111             int extrawidth = (width % 4);
   1112             vector unsigned char valigner = VEC_ALIGNER(srcp);
                    /* vs is primed here and refilled from voverflow at the end of
                       every iteration */
   1113             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
   1114             width -= extrawidth;
                    /* phase 2: four pixels per iteration */
   1115             while (width) {
   1116                 vector unsigned char voverflow;
   1117                 vector unsigned char vd;
   1118                 vector unsigned char valpha;
   1119                 vector unsigned char vdstalpha;
   1120                 /* s = *srcp */
   1121                 voverflow = (vector unsigned char)vec_ld(15, srcp);
   1122                 vs = vec_perm(vs, voverflow, valigner);
   1123                 vs = vec_perm(vs, v0, vsrcPermute);
   1124 
                        /* per-pixel source alpha, replicated into all four bytes of its pixel */
   1125                 valpha = vec_perm(vs, v0, valphaPermute);
   1126 
   1127                 /* d = *dstp */
   1128                 vd = (vector unsigned char)vec_ld(0, dstp);
   1129                 vd = vec_perm(vd, v0, vsdstPermute);
                        /* remember the destination alpha bytes so they survive the blend */
   1130                 vdstalpha = vec_and(vd, valphamask);
   1131 
   1132                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   1133 
   1134                 /* set the alpha to the dest alpha */
   1135                 vd = vec_and(vd, vpixelmask);
   1136                 vd = vec_or(vd, vdstalpha);
   1137                 vd = vec_perm(vd, v0, vdstPermute);
   1138 
   1139                 /* *dstp = res */
   1140                 vec_st((vector unsigned int)vd, 0, dstp);
   1141 
   1142                 srcp += 4;
   1143                 dstp += 4;
   1144                 width -= 4;
   1145                 vs = voverflow;
   1146 
   1147             }
                    /* phase 3: the (width % 4) leftover pixels */
   1148             ONE_PIXEL_BLEND((extrawidth), extrawidth);
   1149         }
   1150 	    srcp += srcskip;
   1151 	    dstp += dstskip;
   1152 #undef ONE_PIXEL_BLEND
   1153 	}
   1154 }
   1155 
   1156 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   /*
    * No swizzle permutes are used here: the scalar path reads the source
    * alpha from the top byte (s >> 24) and treats 0xff00ff / 0xff00 as the
    * red+blue / green fields of both surfaces.
    *
    * Each row is handled in three phases: single pixels while
    * UNALIGNED_PTR(dstp) holds, then four pixels per vector iteration, then
    * the (width % 4) leftover pixels one at a time.  The destination's top
    * byte is preserved in every path.
    */
   1157 static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
   1158 {
   1159 	int width = info->d_width;
   1160 	int height = info->d_height;
   1161 	Uint32 *srcp = (Uint32 *)info->s_pixels;
            /* skips are shifted by 2 because srcp/dstp advance in 32-bit pixels
               (assumes s_skip/d_skip are byte counts -- TODO confirm) */
   1162 	int srcskip = info->s_skip >> 2;
   1163 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   1164 	int dstskip = info->d_skip >> 2;
   1165     vector unsigned char mergePermute;
   1166     vector unsigned char valphaPermute;
   1167     vector unsigned char valphamask;
   1168     vector unsigned char vpixelmask;
   1169     vector unsigned char v0;
   1170     vector unsigned short v1;
   1171     vector unsigned short v8;
   1172     v0 = vec_splat_u8(0);
   1173     v1 = vec_splat_u16(1);
   1174     v8 = vec_splat_u16(8);
   1175     mergePermute = VEC_MERGE_PERMUTE();
   1176     valphamask = VEC_ALPHA_MASK();
            /* 0,0,0,0,4,4,4,4,...: replicates byte 0 of every pixel across that pixel */
   1177     valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
   1178 
   1179 
            /* bitwise complement of valphamask */
   1180     vpixelmask = vec_nor(valphamask, v0);
   1181 	while(height--) {
   1182         width = info->d_width;
                /* Scalar blend: alpha 0 skips the pixel, SDL_ALPHA_OPAQUE copies the
                   source colour, anything else blends red+blue together and green
                   separately; the destination's top byte is kept in all cases. */
   1183 #define ONE_PIXEL_BLEND(condition, widthvar) \
   1184         while ((condition)) { \
   1185             Uint32 dalpha; \
   1186             Uint32 d; \
   1187             Uint32 s1; \
   1188             Uint32 d1; \
   1189             Uint32 s = *srcp; \
   1190             Uint32 alpha = s >> 24; \
   1191             if(alpha) { \
   1192               if(alpha == SDL_ALPHA_OPAQUE) { \
   1193                 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
   1194               } else { \
   1195                 d = *dstp; \
   1196                 dalpha = d & 0xff000000; \
   1197                 s1 = s & 0xff00ff; \
   1198                 d1 = d & 0xff00ff; \
   1199                 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
   1200                 s &= 0xff00; \
   1201                 d &= 0xff00; \
   1202                 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
   1203                 *dstp = d1 | d | dalpha; \
   1204               } \
   1205             } \
   1206             ++srcp; \
   1207             ++dstp; \
   1208             widthvar--; \
   1209 	    }
                /* phase 1: scalar pixels until dstp no longer tests as unaligned */
   1210         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   1211         if (width > 0) {
   1212             int extrawidth = (width % 4);
   1213             vector unsigned char valigner = VEC_ALIGNER(srcp);
                    /* vs is primed here and refilled from voverflow at the end of
                       every iteration */
   1214             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
   1215             width -= extrawidth;
                    /* phase 2: four pixels per iteration */
   1216             while (width) {
   1217                 vector unsigned char voverflow;
   1218                 vector unsigned char vd;
   1219                 vector unsigned char valpha;
   1220                 vector unsigned char vdstalpha;
   1221                 /* s = *srcp */
   1222                 voverflow = (vector unsigned char)vec_ld(15, srcp);
   1223                 vs = vec_perm(vs, voverflow, valigner);
   1224 
                        /* per-pixel source alpha, replicated into all four bytes of its pixel */
   1225                 valpha = vec_perm(vs, v0, valphaPermute);
   1226 
   1227                 /* d = *dstp */
   1228                 vd = (vector unsigned char)vec_ld(0, dstp);
                        /* remember the destination alpha bytes so they survive the blend */
   1229                 vdstalpha = vec_and(vd, valphamask);
   1230 
   1231                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   1232 
   1233                 /* set the alpha to the dest alpha */
   1234                 vd = vec_and(vd, vpixelmask);
   1235                 vd = vec_or(vd, vdstalpha);
   1236 
   1237                 /* *dstp = res */
   1238                 vec_st((vector unsigned int)vd, 0, dstp);
   1239 
   1240                 srcp += 4;
   1241                 dstp += 4;
   1242                 width -= 4;
   1243                 vs = voverflow;
   1244             }
                    /* phase 3: the (width % 4) leftover pixels */
   1245             ONE_PIXEL_BLEND((extrawidth), extrawidth);
   1246         }
   1247 	    srcp += srcskip;
   1248 	    dstp += dstskip;
   1249 	}
   1250 #undef ONE_PIXEL_BLEND
   1251 }
   1252 
   /*
    * AltiVec blit of a 32-bit surface onto a 32-bit surface using one
    * per-surface alpha value (info->src->alpha); source and destination
    * channel layouts are taken from info->src and info->dst.
    *
    * Each row is handled in three phases: single pixels while
    * UNALIGNED_PTR(dstp) holds, then four pixels per vector iteration, then
    * the (width % 4) leftover pixels one at a time.  The scalar path writes
    * dA (SDL_ALPHA_OPAQUE when the destination has an Amask, else 0) as the
    * destination alpha; the vector path ORs valphamask into every pixel.
    */
   1253 static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
   1254 {
   1255     /* XXX : 6 */
   1256 	unsigned alpha = info->src->alpha;
   1257     int height = info->d_height;
   1258     Uint32 *srcp = (Uint32 *)info->s_pixels;
            /* skips are shifted by 2 because srcp/dstp advance in 32-bit pixels
               (assumes s_skip/d_skip are byte counts -- TODO confirm) */
   1259     int srcskip = info->s_skip >> 2;
   1260     Uint32 *dstp = (Uint32 *)info->d_pixels;
   1261     int dstskip = info->d_skip >> 2;
   1262     SDL_PixelFormat *srcfmt = info->src;
   1263     SDL_PixelFormat *dstfmt = info->dst;
            /* same value as alpha above; this copy feeds the scalar macro */
   1264 	unsigned sA = srcfmt->alpha;
   1265 	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   1266     vector unsigned char mergePermute;
   1267     vector unsigned char vsrcPermute;
   1268     vector unsigned char vdstPermute;
   1269     vector unsigned char vsdstPermute;
   1270     vector unsigned char valpha;
   1271     vector unsigned char valphamask;
   1272     vector unsigned char vbits;
   1273     vector unsigned short v1;
   1274     vector unsigned short v8;
   1275 
   1276     mergePermute = VEC_MERGE_PERMUTE();
   1277     v1 = vec_splat_u16(1);
   1278     v8 = vec_splat_u16(8);
   1279 
   1280     /* set the alpha to 255 on the destination surf */
   1281     valphamask = VEC_ALPHA_MASK();
   1282 
            /* source -> working layout, working layout -> destination, destination -> working layout */
   1283     vsrcPermute = calc_swizzle32(srcfmt, NULL);
   1284     vdstPermute = calc_swizzle32(NULL, dstfmt);
   1285     vsdstPermute = calc_swizzle32(dstfmt, NULL);
   1286 
   1287     /* splat the surface alpha into every byte of valpha */
   1288     ((unsigned char *)&valpha)[0] = alpha;
   1289     valpha = vec_splat(valpha, 0);
            /* all-ones vector, used as the second operand of the final permute */
   1290     vbits = (vector unsigned char)vec_splat_s8(-1);
   1291 
   1292     while(height--) {
   1293         int width = info->d_width;
   1294 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
   1295             Uint32 Pixel; \
   1296             unsigned sR, sG, sB, dR, dG, dB; \
   1297             DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
   1298             DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
   1299             ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   1300             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
   1301             ++srcp; \
   1302             ++dstp; \
   1303             widthvar--; \
   1304         }
                /* phase 1: scalar pixels until dstp no longer tests as unaligned */
   1305         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   1306         if (width > 0) {
   1307             int extrawidth = (width % 4);
   1308             vector unsigned char valigner = VEC_ALIGNER(srcp);
                    /* vs is primed here and refilled from voverflow at the end of
                       every iteration */
   1309             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
   1310             width -= extrawidth;
                    /* phase 2: four pixels per iteration */
   1311             while (width) {
   1312                 vector unsigned char voverflow;
   1313                 vector unsigned char vd;
   1314 
   1315                 /* s = *srcp */
   1316                 voverflow = (vector unsigned char)vec_ld(15, srcp);
   1317                 vs = vec_perm(vs, voverflow, valigner);
                        /* permute indices >= 0x10 pick their byte from valpha */
   1318                 vs = vec_perm(vs, valpha, vsrcPermute);
   1319 
   1320                 /* d = *dstp */
   1321                 vd = (vector unsigned char)vec_ld(0, dstp);
   1322                 vd = vec_perm(vd, vd, vsdstPermute);
   1323 
   1324                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   1325 
   1326                 /* set the alpha channel to full on */
   1327                 vd = vec_or(vd, valphamask);
   1328                 vd = vec_perm(vd, vbits, vdstPermute);
   1329 
   1330                 /* *dstp = res */
   1331                 vec_st((vector unsigned int)vd, 0, dstp);
   1332 
   1333                 srcp += 4;
   1334                 dstp += 4;
   1335                 width -= 4;
   1336                 vs = voverflow;
   1337             }
                    /* phase 3: the (width % 4) leftover pixels */
   1338             ONE_PIXEL_BLEND((extrawidth), extrawidth);
   1339         }
   1340 #undef ONE_PIXEL_BLEND
   1341 
   1342         srcp += srcskip;
   1343         dstp += dstskip;
   1344     }
   1345 
   1346 }
   1347 
   1348 
   1349 /* fast RGB888->(A)RGB888 blending */
   /*
    * Blends with one per-surface alpha value (info->src->alpha).  No swizzle
    * permutes are used: the scalar path treats 0xff00ff / 0xff00 as the
    * red+blue / green fields of both surfaces and forces the top byte of
    * every written pixel to 0xff.
    *
    * Each row is handled in three phases: single pixels while
    * UNALIGNED_PTR(dstp) holds, then four pixels per vector iteration, then
    * the (width % 4) leftover pixels one at a time.
    */
   1350 static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
   1351 {
   1352 	unsigned alpha = info->src->alpha;
   1353     int height = info->d_height;
   1354     Uint32 *srcp = (Uint32 *)info->s_pixels;
            /* skips are shifted by 2 because srcp/dstp advance in 32-bit pixels
               (assumes s_skip/d_skip are byte counts -- TODO confirm) */
   1355     int srcskip = info->s_skip >> 2;
   1356     Uint32 *dstp = (Uint32 *)info->d_pixels;
   1357     int dstskip = info->d_skip >> 2;
   1358     vector unsigned char mergePermute;
   1359     vector unsigned char valpha;
   1360     vector unsigned char valphamask;
   1361     vector unsigned short v1;
   1362     vector unsigned short v8;
   1363 
   1364     mergePermute = VEC_MERGE_PERMUTE();
   1365     v1 = vec_splat_u16(1);
   1366     v8 = vec_splat_u16(8);
   1367 
   1368     /* set the alpha to 255 on the destination surf */
   1369     valphamask = VEC_ALPHA_MASK();
   1370 
   1371     /* splat the surface alpha into every byte of valpha */
   1372     ((unsigned char *)&valpha)[0] = alpha;
   1373     valpha = vec_splat(valpha, 0);
   1374 
   1375     while(height--) {
   1376         int width = info->d_width;
                /* Scalar blend: red+blue are blended together in one multiply,
                   green in a second one. */
   1377 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
   1378             Uint32 s = *srcp; \
   1379             Uint32 d = *dstp; \
   1380             Uint32 s1 = s & 0xff00ff; \
   1381             Uint32 d1 = d & 0xff00ff; \
   1382             d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
   1383                  & 0xff00ff; \
   1384             s &= 0xff00; \
   1385             d &= 0xff00; \
   1386             d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
   1387             *dstp = d1 | d | 0xff000000; \
   1388             ++srcp; \
   1389             ++dstp; \
   1390             widthvar--; \
   1391         }
                /* phase 1: scalar pixels until dstp no longer tests as unaligned */
   1392         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   1393         if (width > 0) {
   1394             int extrawidth = (width % 4);
   1395             vector unsigned char valigner = VEC_ALIGNER(srcp);
                    /* vs is primed here and refilled from voverflow at the end of
                       every iteration */
   1396             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
   1397             width -= extrawidth;
                    /* phase 2: four pixels per iteration */
   1398             while (width) {
   1399                 vector unsigned char voverflow;
   1400                 vector unsigned char vd;
   1401 
   1402                 /* s = *srcp */
   1403                 voverflow = (vector unsigned char)vec_ld(15, srcp);
   1404                 vs = vec_perm(vs, voverflow, valigner);
   1405 
   1406                 /* d = *dstp */
   1407                 vd = (vector unsigned char)vec_ld(0, dstp);
   1408 
   1409                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   1410 
   1411                 /* set the alpha channel to full on */
   1412                 vd = vec_or(vd, valphamask);
   1413 
   1414                 /* *dstp = res */
   1415                 vec_st((vector unsigned int)vd, 0, dstp);
   1416 
   1417                 srcp += 4;
   1418                 dstp += 4;
   1419                 width -= 4;
   1420                 vs = voverflow;
   1421             }
                    /* phase 3: the (width % 4) leftover pixels */
   1422             ONE_PIXEL_BLEND((extrawidth), extrawidth);
   1423         }
   1424 #undef ONE_PIXEL_BLEND
   1425 
   1426         srcp += srcskip;
   1427         dstp += dstskip;
   1428     }
   1429 }
   1430 #if __MWERKS__
   1431 #pragma altivec_model off
   1432 #endif
   1433 #endif /* SDL_ALTIVEC_BLITTERS */
   1434 
   1435 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   1436 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
   1437 {
   1438 	int width = info->d_width;
   1439 	int height = info->d_height;
   1440 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   1441 	int srcskip = info->s_skip >> 2;
   1442 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   1443 	int dstskip = info->d_skip >> 2;
   1444 
   1445 	while(height--) {
   1446 	    DUFFS_LOOP4({
   1447 		    Uint32 s = *srcp++;
   1448 		    Uint32 d = *dstp;
   1449 		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   1450 			       + (s & d & 0x00010101)) | 0xff000000;
   1451 	    }, width);
   1452 	    srcp += srcskip;
   1453 	    dstp += dstskip;
   1454 	}
   1455 }
   1456 
   1457 /* fast RGB888->(A)RGB888 blending with surface alpha */
   1458 static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
   1459 {
   1460 	unsigned alpha = info->src->alpha;
   1461 	if(alpha == 128) {
   1462 		BlitRGBtoRGBSurfaceAlpha128(info);
   1463 	} else {
   1464 		int width = info->d_width;
   1465 		int height = info->d_height;
   1466 		Uint32 *srcp = (Uint32 *)info->s_pixels;
   1467 		int srcskip = info->s_skip >> 2;
   1468 		Uint32 *dstp = (Uint32 *)info->d_pixels;
   1469 		int dstskip = info->d_skip >> 2;
   1470 		Uint32 s;
   1471 		Uint32 d;
   1472 		Uint32 s1;
   1473 		Uint32 d1;
   1474 
   1475 		while(height--) {
   1476 			DUFFS_LOOP_DOUBLE2({
   1477 				/* One Pixel Blend */
   1478 				s = *srcp;
   1479 				d = *dstp;
   1480 				s1 = s & 0xff00ff;
   1481 				d1 = d & 0xff00ff;
   1482 				d1 = (d1 + ((s1 - d1) * alpha >> 8))
   1483 				     & 0xff00ff;
   1484 				s &= 0xff00;
   1485 				d &= 0xff00;
   1486 				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
   1487 				*dstp = d1 | d | 0xff000000;
   1488 				++srcp;
   1489 				++dstp;
   1490 			},{
   1491 			        /* Two Pixels Blend */
   1492 				s = *srcp;
   1493 				d = *dstp;
   1494 				s1 = s & 0xff00ff;
   1495 				d1 = d & 0xff00ff;
   1496 				d1 += (s1 - d1) * alpha >> 8;
   1497 				d1 &= 0xff00ff;
   1498 
   1499 				s = ((s & 0xff00) >> 8) |
   1500 					((srcp[1] & 0xff00) << 8);
   1501 				d = ((d & 0xff00) >> 8) |
   1502 					((dstp[1] & 0xff00) << 8);
   1503 				d += (s - d) * alpha >> 8;
   1504 				d &= 0x00ff00ff;
   1505 
   1506 				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
   1507 				++srcp;
   1508 
   1509 			        s1 = *srcp;
   1510 				d1 = *dstp;
   1511 				s1 &= 0xff00ff;
   1512 				d1 &= 0xff00ff;
   1513 				d1 += (s1 - d1) * alpha >> 8;
   1514 				d1 &= 0xff00ff;
   1515 
   1516 				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
   1517 				++srcp;
   1518 				++dstp;
   1519 			}, width);
   1520 			srcp += srcskip;
   1521 			dstp += dstskip;
   1522 		}
   1523 	}
   1524 }
   1525 
   1526 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   1527 static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
   1528 {
   1529 	int width = info->d_width;
   1530 	int height = info->d_height;
   1531 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   1532 	int srcskip = info->s_skip >> 2;
   1533 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   1534 	int dstskip = info->d_skip >> 2;
   1535 
   1536 	while(height--) {
   1537 	    DUFFS_LOOP4({
   1538 		Uint32 dalpha;
   1539 		Uint32 d;
   1540 		Uint32 s1;
   1541 		Uint32 d1;
   1542 		Uint32 s = *srcp;
   1543 		Uint32 alpha = s >> 24;
   1544 		/* FIXME: Here we special-case opaque alpha since the
   1545 		   compositioning used (>>8 instead of /255) doesn't handle
   1546 		   it correctly. Also special-case alpha=0 for speed?
   1547 		   Benchmark this! */
   1548 		if(alpha) {
   1549 		  if(alpha == SDL_ALPHA_OPAQUE) {
   1550 		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
   1551 		  } else {
   1552 		    /*
   1553 		     * take out the middle component (green), and process
   1554 		     * the other two in parallel. One multiply less.
   1555 		     */
   1556 		    d = *dstp;
   1557 		    dalpha = d & 0xff000000;
   1558 		    s1 = s & 0xff00ff;
   1559 		    d1 = d & 0xff00ff;
   1560 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
   1561 		    s &= 0xff00;
   1562 		    d &= 0xff00;
   1563 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
   1564 		    *dstp = d1 | d | dalpha;
   1565 		  }
   1566 		}
   1567 		++srcp;
   1568 		++dstp;
   1569 	    }, width);
   1570 	    srcp += srcskip;
   1571 	    dstp += dstskip;
   1572 	}
   1573 }
   1574 
   1575 #if GCC_ASMBLIT
   1576 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
   1577 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
   1578 {
   1579 	int width = info->d_width;
   1580 	int height = info->d_height;
   1581 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   1582 	int srcskip = info->s_skip >> 2;
   1583 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   1584 	int dstskip = info->d_skip >> 2;
   1585 	SDL_PixelFormat* sf = info->src;
   1586 	Uint32 amask = sf->Amask;
   1587 
   1588 	__asm__ (
   1589 	/* make mm6 all zeros. */
   1590 	"pxor       %%mm6, %%mm6\n"
   1591 
   1592 	/* Make a mask to preserve the alpha. */
   1593 	"movd      %0, %%mm7\n\t"           /* 0000F000 -> mm7 */
   1594 	"punpcklbw %%mm7, %%mm7\n\t"        /* FF000000 -> mm7 */
   1595 	"pcmpeqb   %%mm4, %%mm4\n\t"        /* FFFFFFFF -> mm4 */
   1596 	"movq      %%mm4, %%mm3\n\t"        /* FFFFFFFF -> mm3 (for later) */
   1597 	"pxor      %%mm4, %%mm7\n\t"        /* 00FFFFFF -> mm7 (mult mask) */
   1598 
   1599 	/* form channel masks */
   1600 	"movq      %%mm7, %%mm4\n\t"        /* 00FFFFFF -> mm4 */
   1601 	"packsswb  %%mm6, %%mm4\n\t"        /* 00000FFF -> mm4 (channel mask) */
   1602 	"packsswb  %%mm6, %%mm3\n\t"        /* 0000FFFF -> mm3 */
   1603 	"pxor      %%mm4, %%mm3\n\t"        /* 0000F000 -> mm3 (~channel mask) */
   1604 
   1605 	/* get alpha channel shift */
   1606 	"movd      %1, %%mm5\n\t" /* Ashift -> mm5 */
   1607 
   1608 	  : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
   1609 
   1610 	while(height--) {
   1611 
   1612 	    DUFFS_LOOP4({
   1613 		Uint32 alpha;
   1614 
   1615 		__asm__ (
   1616 		"prefetch 64(%0)\n"
   1617 		"prefetch 64(%1)\n"
   1618 			: : "r" (srcp), "r" (dstp) );
   1619 
   1620 		alpha = *srcp & amask;
   1621 		/* FIXME: Here we special-case opaque alpha since the
   1622 		   compositioning used (>>8 instead of /255) doesn't handle
   1623 		   it correctly. Also special-case alpha=0 for speed?
   1624 		   Benchmark this! */
   1625 		if(alpha == 0) {
   1626 		    /* do nothing */
   1627 		}
   1628 		else if(alpha == amask) {
   1629 			/* opaque alpha -- copy RGB, keep dst alpha */
   1630 		    /* using MMX here to free up regular registers for other things */
   1631 			    __asm__ (
   1632 		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
   1633 		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
   1634 		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
   1635 		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
   1636 		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
   1637 		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
   1638 
   1639 		     : : "r" (srcp), "r" (dstp) );
   1640 		}
   1641 
   1642 		else {
   1643 			    __asm__ (
   1644 		    /* load in the source, and dst. */
   1645 		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
   1646 		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
   1647 
   1648 		    /* Move the src alpha into mm2 */
   1649 
   1650 		    /* if supporting pshufw */
   1651 		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
   1652 		    /*"psrlw     $8, %%mm2\n" */
   1653 
   1654 		    /* else: */
   1655 		    "movd       %2,    %%mm2\n"
   1656 		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
   1657 		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
   1658 		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
   1659 		    "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
   1660 
   1661 		    /* move the colors into words. */
   1662 		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
   1663 		    "punpcklbw %%mm6, %%mm1\n"              /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
   1664 
   1665 		    /* src - dst */
   1666 		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
   1667 
   1668 		    /* A * (src-dst) */
   1669 		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
   1670 		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
   1671 		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
   1672 
   1673 		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
   1674 
   1675 		    "movd      %%mm0, (%1)\n"               /* result in mm0 */
   1676 
   1677 		     : : "r" (srcp), "r" (dstp), "r" (alpha) );
   1678 
   1679 		}
   1680 		++srcp;
   1681 		++dstp;
   1682 	    }, width);
   1683 	    srcp += srcskip;
   1684 	    dstp += dstskip;
   1685 	}
   1686 
   1687 	__asm__ (
   1688 	"emms\n"
   1689 		:   );
   1690 }
   1691 /* End GCC_ASMBLIT*/
   1692 
   1693 #elif MSVC_ASMBLIT
   1694 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
   1695 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
   1696 {
   1697 	int width = info->d_width;
   1698 	int height = info->d_height;
   1699 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   1700 	int srcskip = info->s_skip >> 2;
   1701 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   1702 	int dstskip = info->d_skip >> 2;
   1703 	SDL_PixelFormat* sf = info->src;
   1704 	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
   1705 	Uint32 amask = sf->Amask;
   1706 	Uint32 ashift = sf->Ashift;
   1707 	Uint64 multmask;
   1708 
   1709 	__m64 src1, dst1, mm_alpha, mm_zero, dmask;
   1710 
   1711 	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
   1712 	multmask = ~(0xFFFFi64 << (ashift * 2));
   1713 	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
   1714 
   1715 	while(height--) {
   1716 	    DUFFS_LOOP4({
   1717 		Uint32 alpha;
   1718 
   1719 		_m_prefetch(srcp + 16);
   1720 		_m_prefetch(dstp + 16);
   1721 
   1722 		alpha = *srcp & amask;
   1723 		if (alpha == 0) {
   1724 			/* do nothing */
   1725 		} else if (alpha == amask) {
   1726 			/* copy RGB, keep dst alpha */
   1727 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
   1728 		} else {
   1729 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
   1730 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   1731 
   1732 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   1733 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   1734 
   1735 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   1736 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   1737 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   1738 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
   1739 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
   1740 
   1741 			/* blend */
   1742 			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
   1743 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
   1744 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
   1745 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
   1746 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   1747 
   1748 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   1749 		}
   1750 		++srcp;
   1751 		++dstp;
   1752 	    }, width);
   1753 	    srcp += srcskip;
   1754 	    dstp += dstskip;
   1755 	}
   1756 	_mm_empty();
   1757 }
   1758 /* End MSVC_ASMBLIT */
   1759 
   1760 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   1761 
   1762 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
   1763 
   1764 /* blend a single 16 bit pixel at 50% */
   1765 #define BLEND16_50(d, s, mask)						\
   1766 	((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
   1767 
   1768 /* blend two 16 bit pixels at 50% */
   1769 #define BLEND2x16_50(d, s, mask)					     \
   1770 	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
   1771 	 + (s & d & (~(mask | mask << 16))))
   1772 
   1773 static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
   1774 {
   1775 	int width = info->d_width;
   1776 	int height = info->d_height;
   1777 	Uint16 *srcp = (Uint16 *)info->s_pixels;
   1778 	int srcskip = info->s_skip >> 1;
   1779 	Uint16 *dstp = (Uint16 *)info->d_pixels;
   1780 	int dstskip = info->d_skip >> 1;
   1781 
   1782 	while(height--) {
   1783 		if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
   1784 			/*
   1785 			 * Source and destination not aligned, pipeline it.
   1786 			 * This is mostly a win for big blits but no loss for
   1787 			 * small ones
   1788 			 */
   1789 			Uint32 prev_sw;
   1790 			int w = width;
   1791 
   1792 			/* handle odd destination */
   1793 			if((uintptr_t)dstp & 2) {
   1794 				Uint16 d = *dstp, s = *srcp;
   1795 				*dstp = BLEND16_50(d, s, mask);
   1796 				dstp++;
   1797 				srcp++;
   1798 				w--;
   1799 			}
   1800 			srcp++;	/* srcp is now 32-bit aligned */
   1801 
   1802 			/* bootstrap pipeline with first halfword */
   1803 			prev_sw = ((Uint32 *)srcp)[-1];
   1804 
   1805 			while(w > 1) {
   1806 				Uint32 sw, dw, s;
   1807 				sw = *(Uint32 *)srcp;
   1808 				dw = *(Uint32 *)dstp;
   1809 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
   1810 				s = (prev_sw << 16) + (sw >> 16);
   1811 #else
   1812 				s = (prev_sw >> 16) + (sw << 16);
   1813 #endif
   1814 				prev_sw = sw;
   1815 				*(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
   1816 				dstp += 2;
   1817 				srcp += 2;
   1818 				w -= 2;
   1819 			}
   1820 
   1821 			/* final pixel if any */
   1822 			if(w) {
   1823 				Uint16 d = *dstp, s;
   1824 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
   1825 				s = (Uint16)prev_sw;
   1826 #else
   1827 				s = (Uint16)(prev_sw >> 16);
   1828 #endif
   1829 				*dstp = BLEND16_50(d, s, mask);
   1830 				srcp++;
   1831 				dstp++;
   1832 			}
   1833 			srcp += srcskip - 1;
   1834 			dstp += dstskip;
   1835 		} else {
   1836 			/* source and destination are aligned */
   1837 			int w = width;
   1838 
   1839 			/* first odd pixel? */
   1840 			if((uintptr_t)srcp & 2) {
   1841 				Uint16 d = *dstp, s = *srcp;
   1842 				*dstp = BLEND16_50(d, s, mask);
   1843 				srcp++;
   1844 				dstp++;
   1845 				w--;
   1846 			}
   1847 			/* srcp and dstp are now 32-bit aligned */
   1848 
   1849 			while(w > 1) {
   1850 				Uint32 sw = *(Uint32 *)srcp;
   1851 				Uint32 dw = *(Uint32 *)dstp;
   1852 				*(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
   1853 				srcp += 2;
   1854 				dstp += 2;
   1855 				w -= 2;
   1856 			}
   1857 
   1858 			/* last odd pixel? */
   1859 			if(w) {
   1860 				Uint16 d = *dstp, s = *srcp;
   1861 				*dstp = BLEND16_50(d, s, mask);
   1862 				srcp++;
   1863 				dstp++;
   1864 			}
   1865 			srcp += srcskip;
   1866 			dstp += dstskip;
   1867 		}
   1868 	}
   1869 }
   1870 
   1871 #if GCC_ASMBLIT
   1872 /* fast RGB565->RGB565 blending with surface alpha */
   1873 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
   1874 {
   1875 	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
   1876 	if(alpha == 128) {
   1877 		Blit16to16SurfaceAlpha128(info, 0xf7de);
   1878 	} else {
   1879 		int width = info->d_width;
   1880 		int height = info->d_height;
   1881 		Uint16 *srcp = (Uint16 *)info->s_pixels;
   1882 		int srcskip = info->s_skip >> 1;
   1883 		Uint16 *dstp = (Uint16 *)info->d_pixels;
   1884 		int dstskip = info->d_skip >> 1;
   1885 		Uint32 s, d;
   1886 		Uint8 load[8];
   1887 
   1888 		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
   1889 		*(Uint64 *)load = alpha;
   1890 		alpha >>= 3;		/* downscale alpha to 5 bits */
   1891 
   1892 		movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
   1893 		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
   1894 		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
   1895 		/* position alpha to allow for mullo and mulhi on diff channels
   1896 		   to reduce the number of operations */
   1897 		psllq_i2r(3, mm0);
   1898 
   1899 		/* Setup the 565 color channel masks */
   1900 		*(Uint64 *)load = 0x07E007E007E007E0ULL;
   1901 		movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
   1902 		*(Uint64 *)load = 0x001F001F001F001FULL;
   1903 		movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
   1904 		while(height--) {
   1905 			DUFFS_LOOP_QUATRO2(
   1906 			{
   1907 				s = *srcp++;
   1908 				d = *dstp;
   1909 				/*
   1910 				 * shift out the middle component (green) to
   1911 				 * the high 16 bits, and process all three RGB
   1912 				 * components at the same time.
   1913 				 */
   1914 				s = (s | s << 16) & 0x07e0f81f;
   1915 				d = (d | d << 16) & 0x07e0f81f;
   1916 				d += (s - d) * alpha >> 5;
   1917 				d &= 0x07e0f81f;
   1918 				*dstp++ = d | d >> 16;
   1919 			},{
   1920 				s = *srcp++;
   1921 				d = *dstp;
   1922 				/*
   1923 				 * shift out the middle component (green) to
   1924 				 * the high 16 bits, and process all three RGB
   1925 				 * components at the same time.
   1926 				 */
   1927 				s = (s | s << 16) & 0x07e0f81f;
   1928 				d = (d | d << 16) & 0x07e0f81f;
   1929 				d += (s - d) * alpha >> 5;
   1930 				d &= 0x07e0f81f;
   1931 				*dstp++ = d | d >> 16;
   1932 				s = *srcp++;
   1933 				d = *dstp;
   1934 				/*
   1935 				 * shift out the middle component (green) to
   1936 				 * the high 16 bits, and process all three RGB
   1937 				 * components at the same time.
   1938 				 */
   1939 				s = (s | s << 16) & 0x07e0f81f;
   1940 				d = (d | d << 16) & 0x07e0f81f;
   1941 				d += (s - d) * alpha >> 5;
   1942 				d &= 0x07e0f81f;
   1943 				*dstp++ = d | d >> 16;
   1944 			},{
   1945 				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
   1946 				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
   1947 
   1948 				/* red -- does not need a mask since the right shift clears
   1949 				   the uninteresting bits */
   1950 				movq_r2r(mm2, mm5); /* src -> mm5 */
   1951 				movq_r2r(mm3, mm6); /* dst -> mm6 */
   1952 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
   1953 				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
   1954 
   1955 				/* blend */
   1956 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
   1957 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
   1958 				/* alpha used is actually 11 bits
   1959 				   11 + 5 = 16 bits, so the sign bits are lost */
   1960 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
   1961 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
   1962 				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
   1963 
   1964 				movq_r2r(mm6, mm1); /* save new reds in dsts */
   1965 
   1966 				/* green -- process the bits in place */
   1967 				movq_r2r(mm2, mm5); /* src -> mm5 */
   1968 				movq_r2r(mm3, mm6); /* dst -> mm6 */
   1969 				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
   1970 				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
   1971 
   1972 				/* blend */
   1973 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
   1974 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
   1975 				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
   1976 				   bits are gone and the sign bits present */
   1977 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
   1978 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
   1979 
   1980 				por_r2r(mm6, mm1); /* save new greens in dsts */
   1981 
   1982 				/* blue */
   1983 				movq_r2r(mm2, mm5); /* src -> mm5 */
   1984 				movq_r2r(mm3, mm6); /* dst -> mm6 */
   1985 				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
   1986 				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
   1987 
   1988 				/* blend */
   1989 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
   1990 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
   1991 				/* 11 + 5 = 16 bits, so the sign bits are lost and
   1992 				   the interesting bits will need to be MASKed */
   1993 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
   1994 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
   1995 				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
   1996 
   1997 				por_r2r(mm6, mm1); /* save new blues in dsts */
   1998 
   1999 				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
   2000 
   2001 				srcp += 4;
   2002 				dstp += 4;
   2003 			}, width);
   2004 			srcp += srcskip;
   2005 			dstp += dstskip;
   2006 		}
   2007 		emms();
   2008 	}
   2009 }
   2010 
   2011 /* fast RGB555->RGB555 blending with surface alpha */
   2012 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
   2013 {
   2014 	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
   2015 	if(alpha == 128) {
   2016 		Blit16to16SurfaceAlpha128(info, 0xfbde);
   2017 	} else {
   2018 		int width = info->d_width;
   2019 		int height = info->d_height;
   2020 		Uint16 *srcp = (Uint16 *)info->s_pixels;
   2021 		int srcskip = info->s_skip >> 1;
   2022 		Uint16 *dstp = (Uint16 *)info->d_pixels;
   2023 		int dstskip = info->d_skip >> 1;
   2024 		Uint32 s, d;
   2025 		Uint8 load[8];
   2026 
   2027 		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
   2028 		*(Uint64 *)load = alpha;
   2029 		alpha >>= 3;		/* downscale alpha to 5 bits */
   2030 
   2031 		movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
   2032 		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
   2033 		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
   2034 		/* position alpha to allow for mullo and mulhi on diff channels
   2035 		   to reduce the number of operations */
   2036 		psllq_i2r(3, mm0);
   2037 
   2038 		/* Setup the 555 color channel masks */
   2039 		*(Uint64 *)load = 0x03E003E003E003E0ULL;
   2040 		movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
   2041 		*(Uint64 *)load = 0x001F001F001F001FULL;
   2042 		movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
   2043 		while(height--) {
   2044 			DUFFS_LOOP_QUATRO2(
   2045 			{
   2046 				s = *srcp++;
   2047 				d = *dstp;
   2048 				/*
   2049 				 * shift out the middle component (green) to
   2050 				 * the high 16 bits, and process all three RGB
   2051 				 * components at the same time.
   2052 				 */
   2053 				s = (s | s << 16) & 0x03e07c1f;
   2054 				d = (d | d << 16) & 0x03e07c1f;
   2055 				d += (s - d) * alpha >> 5;
   2056 				d &= 0x03e07c1f;
   2057 				*dstp++ = d | d >> 16;
   2058 			},{
   2059 				s = *srcp++;
   2060 				d = *dstp;
   2061 				/*
   2062 				 * shift out the middle component (green) to
   2063 				 * the high 16 bits, and process all three RGB
   2064 				 * components at the same time.
   2065 				 */
   2066 				s = (s | s << 16) & 0x03e07c1f;
   2067 				d = (d | d << 16) & 0x03e07c1f;
   2068 				d += (s - d) * alpha >> 5;
   2069 				d &= 0x03e07c1f;
   2070 				*dstp++ = d | d >> 16;
   2071 			        s = *srcp++;
   2072 				d = *dstp;
   2073 				/*
   2074 				 * shift out the middle component (green) to
   2075 				 * the high 16 bits, and process all three RGB
   2076 				 * components at the same time.
   2077 				 */
   2078 				s = (s | s << 16) & 0x03e07c1f;
   2079 				d = (d | d << 16) & 0x03e07c1f;
   2080 				d += (s - d) * alpha >> 5;
   2081 				d &= 0x03e07c1f;
   2082 				*dstp++ = d | d >> 16;
   2083 			},{
   2084 				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
   2085 				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
   2086 
   2087 				/* red -- process the bits in place */
   2088 				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
   2089 					/* by reusing the GREEN mask we free up another mmx
   2090 					   register to accumulate the result */
   2091 
   2092 				movq_r2r(mm2, mm5); /* src -> mm5 */
   2093 				movq_r2r(mm3, mm6); /* dst -> mm6 */
   2094 				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
   2095 				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
   2096 
   2097 				/* blend */
   2098 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
   2099 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
   2100 				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
   2101 				   cleared by a MASK below */
   2102 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
   2103 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
   2104 				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
   2105 
   2106 				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
   2107 
   2108 				movq_r2r(mm6, mm1); /* save new reds in dsts */
   2109 
   2110 				/* green -- process the bits in place */
   2111 				movq_r2r(mm2, mm5); /* src -> mm5 */
   2112 				movq_r2r(mm3, mm6); /* dst -> mm6 */
   2113 				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
   2114 				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
   2115 
   2116 				/* blend */
   2117 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
   2118 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
   2119 				/* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
   2120 				   bits are gone and the sign bits present */
   2121 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
   2122 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
   2123 
   2124 				por_r2r(mm6, mm1); /* save new greens in dsts */
   2125 
   2126 				/* blue */
   2127 				movq_r2r(mm2, mm5); /* src -> mm5 */
   2128 				movq_r2r(mm3, mm6); /* dst -> mm6 */
   2129 				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
   2130 				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
   2131 
   2132 				/* blend */
   2133 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
   2134 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
   2135 				/* 11 + 5 = 16 bits, so the sign bits are lost and
   2136 				   the interesting bits will need to be MASKed */
   2137 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
   2138 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
   2139 				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
   2140 
   2141 				por_r2r(mm6, mm1); /* save new blues in dsts */
   2142 
   2143 				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
   2144 
   2145 				srcp += 4;
   2146 				dstp += 4;
   2147 			}, width);
   2148 			srcp += srcskip;
   2149 			dstp += dstskip;
   2150 		}
   2151 		emms();
   2152 	}
   2153 }
   2154 /* End GCC_ASMBLIT */
   2155 
   2156 #elif MSVC_ASMBLIT
   2157 /* fast RGB565->RGB565 blending with surface alpha */
   2158 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
   2159 {
   2160 	unsigned alpha = info->src->alpha;
   2161 	if(alpha == 128) {
   2162 		Blit16to16SurfaceAlpha128(info, 0xf7de);
   2163 	} else {
   2164 		int width = info->d_width;
   2165 		int height = info->d_height;
   2166 		Uint16 *srcp = (Uint16 *)info->s_pixels;
   2167 		int srcskip = info->s_skip >> 1;
   2168 		Uint16 *dstp = (Uint16 *)info->d_pixels;
   2169 		int dstskip = info->d_skip >> 1;
   2170 		Uint32 s, d;
   2171 
   2172 		__m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
   2173 
   2174 		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
   2175 		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
   2176 		alpha >>= 3;		/* downscale alpha to 5 bits */
   2177 
   2178 		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   2179 		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
   2180 		/* position alpha to allow for mullo and mulhi on diff channels
   2181 		   to reduce the number of operations */
   2182 		mm_alpha = _mm_slli_si64(mm_alpha, 3);
   2183 
   2184 		/* Setup the 565 color channel masks */
   2185 		gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
   2186 		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
   2187 
   2188 		while(height--) {
   2189 			DUFFS_LOOP_QUATRO2(
   2190 			{
   2191 				s = *srcp++;
   2192 				d = *dstp;
   2193 				/*
   2194 				 * shift out the middle component (green) to
   2195 				 * the high 16 bits, and process all three RGB
   2196 				 * components at the same time.
   2197 				 */
   2198 				s = (s | s << 16) & 0x07e0f81f;
   2199 				d = (d | d << 16) & 0x07e0f81f;
   2200 				d += (s - d) * alpha >> 5;
   2201 				d &= 0x07e0f81f;
   2202 				*dstp++ = (Uint16)(d | d >> 16);
   2203 			},{
   2204 				s = *srcp++;
   2205 				d = *dstp;
   2206 				/*
   2207 				 * shift out the middle component (green) to
   2208 				 * the high 16 bits, and process all three RGB
   2209 				 * components at the same time.
   2210 				 */
   2211 				s = (s | s << 16) & 0x07e0f81f;
   2212 				d = (d | d << 16) & 0x07e0f81f;
   2213 				d += (s - d) * alpha >> 5;
   2214 				d &= 0x07e0f81f;
   2215 				*dstp++ = (Uint16)(d | d >> 16);
   2216 				s = *srcp++;
   2217 				d = *dstp;
   2218 				/*
   2219 				 * shift out the middle component (green) to
   2220 				 * the high 16 bits, and process all three RGB
   2221 				 * components at the same time.
   2222 				 */
   2223 				s = (s | s << 16) & 0x07e0f81f;
   2224 				d = (d | d << 16) & 0x07e0f81f;
   2225 				d += (s - d) * alpha >> 5;
   2226 				d &= 0x07e0f81f;
   2227 				*dstp++ = (Uint16)(d | d >> 16);
   2228 			},{
   2229 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
   2230 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
   2231 
   2232 				/* red */
   2233 				src2 = src1;
   2234 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
   2235 
   2236 				dst2 = dst1;
   2237 				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
   2238 
   2239 				/* blend */
   2240 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   2241 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   2242 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
   2243 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   2244 				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
   2245 
   2246 				mm_res = dst2; /* RED -> mm_res */
   2247 
   2248 				/* green -- process the bits in place */
   2249 				src2 = src1;
   2250 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
   2251 
   2252 				dst2 = dst1;
   2253 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
   2254 
   2255 				/* blend */
   2256 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   2257 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   2258 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
   2259 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   2260 
   2261 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
   2262 
   2263 				/* blue */
   2264 				src2 = src1;
   2265 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
   2266 
   2267 				dst2 = dst1;
   2268 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
   2269 
   2270 				/* blend */
   2271 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   2272 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   2273 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
   2274 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   2275 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
   2276 
   2277 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
   2278 
   2279 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
   2280 
   2281 				srcp += 4;
   2282 				dstp += 4;
   2283 			}, width);
   2284 			srcp += srcskip;
   2285 			dstp += dstskip;
   2286 		}
   2287 		_mm_empty();
   2288 	}
   2289 }
   2290 
   2291 /* fast RGB555->RGB555 blending with surface alpha */
   2292 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
   2293 {
   2294 	unsigned alpha = info->src->alpha;
   2295 	if(alpha == 128) {
   2296 		Blit16to16SurfaceAlpha128(info, 0xfbde);
   2297 	} else {
   2298 		int width = info->d_width;
   2299 		int height = info->d_height;
   2300 		Uint16 *srcp = (Uint16 *)info->s_pixels;
   2301 		int srcskip = info->s_skip >> 1;
   2302 		Uint16 *dstp = (Uint16 *)info->d_pixels;
   2303 		int dstskip = info->d_skip >> 1;
   2304 		Uint32 s, d;
   2305 
   2306 		__m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
   2307 
   2308 		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
   2309 		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
   2310 		alpha >>= 3;		/* downscale alpha to 5 bits */
   2311 
   2312 		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   2313 		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
   2314 		/* position alpha to allow for mullo and mulhi on diff channels
   2315 		   to reduce the number of operations */
   2316 		mm_alpha = _mm_slli_si64(mm_alpha, 3);
   2317 
   2318 		/* Setup the 555 color channel masks */
   2319 		rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
   2320 		gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
   2321 		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
   2322 
   2323 		while(height--) {
   2324 			DUFFS_LOOP_QUATRO2(
   2325 			{
   2326 				s = *srcp++;
   2327 				d = *dstp;
   2328 				/*
   2329 				 * shift out the middle component (green) to
   2330 				 * the high 16 bits, and process all three RGB
   2331 				 * components at the same time.
   2332 				 */
   2333 				s = (s | s << 16) & 0x03e07c1f;
   2334 				d = (d | d << 16) & 0x03e07c1f;
   2335 				d += (s - d) * alpha >> 5;
   2336 				d &= 0x03e07c1f;
   2337 				*dstp++ = (Uint16)(d | d >> 16);
   2338 			},{
   2339 				s = *srcp++;
   2340 				d = *dstp;
   2341 				/*
   2342 				 * shift out the middle component (green) to
   2343 				 * the high 16 bits, and process all three RGB
   2344 				 * components at the same time.
   2345 				 */
   2346 				s = (s | s << 16) & 0x03e07c1f;
   2347 				d = (d | d << 16) & 0x03e07c1f;
   2348 				d += (s - d) * alpha >> 5;
   2349 				d &= 0x03e07c1f;
   2350 				*dstp++ = (Uint16)(d | d >> 16);
   2351 			        s = *srcp++;
   2352 				d = *dstp;
   2353 				/*
   2354 				 * shift out the middle component (green) to
   2355 				 * the high 16 bits, and process all three RGB
   2356 				 * components at the same time.
   2357 				 */
   2358 				s = (s | s << 16) & 0x03e07c1f;
   2359 				d = (d | d << 16) & 0x03e07c1f;
   2360 				d += (s - d) * alpha >> 5;
   2361 				d &= 0x03e07c1f;
   2362 				*dstp++ = (Uint16)(d | d >> 16);
   2363 			},{
   2364 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
   2365 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
   2366 
   2367 				/* red -- process the bits in place */
   2368 				src2 = src1;
   2369 				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
   2370 
   2371 				dst2 = dst1;
   2372 				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
   2373 
   2374 				/* blend */
   2375 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   2376 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   2377 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
   2378 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   2379 				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
   2380 
   2381 				mm_res = dst2; /* RED -> mm_res */
   2382 
   2383 				/* green -- process the bits in place */
   2384 				src2 = src1;
   2385 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
   2386 
   2387 				dst2 = dst1;
   2388 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
   2389 
   2390 				/* blend */
   2391 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   2392 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   2393 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
   2394 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   2395 
   2396 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
   2397 
   2398 				/* blue */
   2399 				src2 = src1; /* src -> src2 */
   2400 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
   2401 
   2402 				dst2 = dst1; /* dst -> dst2 */
   2403 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
   2404 
   2405 				/* blend */
   2406 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   2407 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   2408 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
   2409 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   2410 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
   2411 
   2412 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
   2413 
   2414 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
   2415 
   2416 				srcp += 4;
   2417 				dstp += 4;
   2418 			}, width);
   2419 			srcp += srcskip;
   2420 			dstp += dstskip;
   2421 		}
   2422 		_mm_empty();
   2423 	}
   2424 }
   2425 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   2426 
   2427 /* fast RGB565->RGB565 blending with surface alpha */
   2428 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
   2429 {
   2430 	unsigned alpha = info->src->alpha;
   2431 	if(alpha == 128) {
   2432 		Blit16to16SurfaceAlpha128(info, 0xf7de);
   2433 	} else {
   2434 		int width = info->d_width;
   2435 		int height = info->d_height;
   2436 		Uint16 *srcp = (Uint16 *)info->s_pixels;
   2437 		int srcskip = info->s_skip >> 1;
   2438 		Uint16 *dstp = (Uint16 *)info->d_pixels;
   2439 		int dstskip = info->d_skip >> 1;
   2440 		alpha >>= 3;	/* downscale alpha to 5 bits */
   2441 
   2442 		while(height--) {
   2443 			DUFFS_LOOP4({
   2444 				Uint32 s = *srcp++;
   2445 				Uint32 d = *dstp;
   2446 				/*
   2447 				 * shift out the middle component (green) to
   2448 				 * the high 16 bits, and process all three RGB
   2449 				 * components at the same time.
   2450 				 */
   2451 				s = (s | s << 16) & 0x07e0f81f;
   2452 				d = (d | d << 16) & 0x07e0f81f;
   2453 				d += (s - d) * alpha >> 5;
   2454 				d &= 0x07e0f81f;
   2455 				*dstp++ = (Uint16)(d | d >> 16);
   2456 			}, width);
   2457 			srcp += srcskip;
   2458 			dstp += dstskip;
   2459 		}
   2460 	}
   2461 }
   2462 
   2463 /* fast RGB555->RGB555 blending with surface alpha */
   2464 static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
   2465 {
   2466 	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
   2467 	if(alpha == 128) {
   2468 		Blit16to16SurfaceAlpha128(info, 0xfbde);
   2469 	} else {
   2470 		int width = info->d_width;
   2471 		int height = info->d_height;
   2472 		Uint16 *srcp = (Uint16 *)info->s_pixels;
   2473 		int srcskip = info->s_skip >> 1;
   2474 		Uint16 *dstp = (Uint16 *)info->d_pixels;
   2475 		int dstskip = info->d_skip >> 1;
   2476 		alpha >>= 3;		/* downscale alpha to 5 bits */
   2477 
   2478 		while(height--) {
   2479 			DUFFS_LOOP4({
   2480 				Uint32 s = *srcp++;
   2481 				Uint32 d = *dstp;
   2482 				/*
   2483 				 * shift out the middle component (green) to
   2484 				 * the high 16 bits, and process all three RGB
   2485 				 * components at the same time.
   2486 				 */
   2487 				s = (s | s << 16) & 0x03e07c1f;
   2488 				d = (d | d << 16) & 0x03e07c1f;
   2489 				d += (s - d) * alpha >> 5;
   2490 				d &= 0x03e07c1f;
   2491 				*dstp++ = (Uint16)(d | d >> 16);
   2492 			}, width);
   2493 			srcp += srcskip;
   2494 			dstp += dstskip;
   2495 		}
   2496 	}
   2497 }
   2498 
   2499 /* fast ARGB8888->RGB565 blending with pixel alpha */
   2500 static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
   2501 {
   2502 	int width = info->d_width;
   2503 	int height = info->d_height;
   2504 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   2505 	int srcskip = info->s_skip >> 2;
   2506 	Uint16 *dstp = (Uint16 *)info->d_pixels;
   2507 	int dstskip = info->d_skip >> 1;
   2508 
   2509 	while(height--) {
   2510 	    DUFFS_LOOP4({
   2511 		Uint32 s = *srcp;
   2512 		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
   2513 		/* FIXME: Here we special-case opaque alpha since the
   2514 		   compositioning used (>>8 instead of /255) doesn't handle
   2515 		   it correctly. Also special-case alpha=0 for speed?
   2516 		   Benchmark this! */
   2517 		if(alpha) {
   2518 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
   2519 		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
   2520 		  } else {
   2521 		    Uint32 d = *dstp;
   2522 		    /*
   2523 		     * convert source and destination to G0RAB65565
   2524 		     * and blend all components at the same time
   2525 		     */
   2526 		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
   2527 		      + (s >> 3 & 0x1f);
   2528 		    d = (d | d << 16) & 0x07e0f81f;
   2529 		    d += (s - d) * alpha >> 5;
   2530 		    d &= 0x07e0f81f;
   2531 		    *dstp = (Uint16)(d | d >> 16);
   2532 		  }
   2533 		}
   2534 		srcp++;
   2535 		dstp++;
   2536 	    }, width);
   2537 	    srcp += srcskip;
   2538 	    dstp += dstskip;
   2539 	}
   2540 }
   2541 
   2542 /* fast ARGB8888->RGB555 blending with pixel alpha */
   2543 static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
   2544 {
   2545 	int width = info->d_width;
   2546 	int height = info->d_height;
   2547 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   2548 	int srcskip = info->s_skip >> 2;
   2549 	Uint16 *dstp = (Uint16 *)info->d_pixels;
   2550 	int dstskip = info->d_skip >> 1;
   2551 
   2552 	while(height--) {
   2553 	    DUFFS_LOOP4({
   2554 		unsigned alpha;
   2555 		Uint32 s = *srcp;
   2556 		alpha = s >> 27; /* downscale alpha to 5 bits */
   2557 		/* FIXME: Here we special-case opaque alpha since the
   2558 		   compositioning used (>>8 instead of /255) doesn't handle
   2559 		   it correctly. Also special-case alpha=0 for speed?
   2560 		   Benchmark this! */
   2561 		if(alpha) {
   2562 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
   2563 		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
   2564 		  } else {
   2565 		    Uint32 d = *dstp;
   2566 		    /*
   2567 		     * convert source and destination to G0RAB65565
   2568 		     * and blend all components at the same time
   2569 		     */
   2570 		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
   2571 		      + (s >> 3 & 0x1f);
   2572 		    d = (d | d << 16) & 0x03e07c1f;
   2573 		    d += (s - d) * alpha >> 5;
   2574 		    d &= 0x03e07c1f;
   2575 		    *dstp = (Uint16)(d | d >> 16);
   2576 		  }
   2577 		}
   2578 		srcp++;
   2579 		dstp++;
   2580 	    }, width);
   2581 	    srcp += srcskip;
   2582 	    dstp += dstskip;
   2583 	}
   2584 }
   2585 
   2586 /* General (slow) N->N blending with per-surface alpha */
   2587 static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
   2588 {
   2589 	int width = info->d_width;
   2590 	int height = info->d_height;
   2591 	Uint8 *src = info->s_pixels;
   2592 	int srcskip = info->s_skip;
   2593 	Uint8 *dst = info->d_pixels;
   2594 	int dstskip = info->d_skip;
   2595 	SDL_PixelFormat *srcfmt = info->src;
   2596 	SDL_PixelFormat *dstfmt = info->dst;
   2597 	int srcbpp = srcfmt->BytesPerPixel;
   2598 	int dstbpp = dstfmt->BytesPerPixel;
   2599 	unsigned sA = srcfmt->alpha;
   2600 	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   2601 
   2602 	if(sA) {
   2603 	  while ( height-- ) {
   2604 	    DUFFS_LOOP4(
   2605 	    {
   2606 		Uint32 Pixel;
   2607 		unsigned sR;
   2608 		unsigned sG;
   2609 		unsigned sB;
   2610 		unsigned dR;
   2611 		unsigned dG;
   2612 		unsigned dB;
   2613 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   2614 		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
   2615 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
   2616 		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
   2617 		src += srcbpp;
   2618 		dst += dstbpp;
   2619 	    },
   2620 	    width);
   2621 	    src += srcskip;
   2622 	    dst += dstskip;
   2623 	  }
   2624 	}
   2625 }
   2626 
   2627 /* General (slow) colorkeyed N->N blending with per-surface alpha */
   2628 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
   2629 {
   2630 	int width = info->d_width;
   2631 	int height = info->d_height;
   2632 	Uint8 *src = info->s_pixels;
   2633 	int srcskip = info->s_skip;
   2634 	Uint8 *dst = info->d_pixels;
   2635 	int dstskip = info->d_skip;
   2636 	SDL_PixelFormat *srcfmt = info->src;
   2637 	SDL_PixelFormat *dstfmt = info->dst;
   2638 	Uint32 ckey = srcfmt->colorkey;
   2639 	int srcbpp = srcfmt->BytesPerPixel;
   2640 	int dstbpp = dstfmt->BytesPerPixel;
   2641 	unsigned sA = srcfmt->alpha;
   2642 	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   2643 
   2644 	while ( height-- ) {
   2645 	    DUFFS_LOOP4(
   2646 	    {
   2647 		Uint32 Pixel;
   2648 		unsigned sR;
   2649 		unsigned sG;
   2650 		unsigned sB;
   2651 		unsigned dR;
   2652 		unsigned dG;
   2653 		unsigned dB;
   2654 		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
   2655 		if(sA && Pixel != ckey) {
   2656 		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
   2657 		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
   2658 		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
   2659 		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
   2660 		}
   2661 		src += srcbpp;
   2662 		dst += dstbpp;
   2663 	    },
   2664 	    width);
   2665 	    src += srcskip;
   2666 	    dst += dstskip;
   2667 	}
   2668 }
   2669 
   2670 /* General (slow) N->N blending with pixel alpha */
   2671 static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
   2672 {
   2673 	int width = info->d_width;
   2674 	int height = info->d_height;
   2675 	Uint8 *src = info->s_pixels;
   2676 	int srcskip = info->s_skip;
   2677 	Uint8 *dst = info->d_pixels;
   2678 	int dstskip = info->d_skip;
   2679 	SDL_PixelFormat *srcfmt = info->src;
   2680 	SDL_PixelFormat *dstfmt = info->dst;
   2681 
   2682 	int  srcbpp;
   2683 	int  dstbpp;
   2684 
   2685 	/* Set up some basic variables */
   2686 	srcbpp = srcfmt->BytesPerPixel;
   2687 	dstbpp = dstfmt->BytesPerPixel;
   2688 
   2689 	/* FIXME: for 8bpp source alpha, this doesn't get opaque values
   2690 	   quite right. for <8bpp source alpha, it gets them very wrong
   2691 	   (check all macros!)
   2692 	   It is unclear whether there is a good general solution that doesn't
   2693 	   need a branch (or a divide). */
   2694 	while ( height-- ) {
   2695 	    DUFFS_LOOP4(
   2696 	    {
   2697 		Uint32 Pixel;
   2698 		unsigned sR;
   2699 		unsigned sG;
   2700 		unsigned sB;
   2701 		unsigned dR;
   2702 		unsigned dG;
   2703 		unsigned dB;
   2704 		unsigned sA;
   2705 		unsigned dA;
   2706 		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
   2707 		if(sA) {
   2708 		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
   2709 		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
   2710 		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
   2711 		}
   2712 		src += srcbpp;
   2713 		dst += dstbpp;
   2714 	    },
   2715 	    width);
   2716 	    src += srcskip;
   2717 	    dst += dstskip;
   2718 	}
   2719 }
   2720 
   2721 
   2722 SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
   2723 {
   2724     SDL_PixelFormat *sf = surface->format;
   2725     SDL_PixelFormat *df = surface->map->dst->format;
   2726 
   2727     if(sf->Amask == 0) {
   2728 	if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
   2729 	    if(df->BytesPerPixel == 1)
   2730 		return BlitNto1SurfaceAlphaKey;
   2731 	    else
   2732 #if SDL_ALTIVEC_BLITTERS
   2733 	if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
   2734 	    !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
   2735             return Blit32to32SurfaceAlphaKeyAltivec;
   2736         else
   2737 #endif
   2738             return BlitNtoNSurfaceAlphaKey;
   2739 	} else {
   2740 	    /* Per-surface alpha blits */
   2741 	    switch(df->BytesPerPixel) {
   2742 	    case 1:
   2743 		return BlitNto1SurfaceAlpha;
   2744 
   2745 	    case 2:
   2746 		if(surface->map->identity) {
   2747 		    if(df->Gmask == 0x7e0)
   2748 		    {
   2749 #if MMX_ASMBLIT
   2750 		if(SDL_HasMMX())
   2751 			return Blit565to565SurfaceAlphaMMX;
   2752 		else
   2753 #endif
   2754 			return Blit565to565SurfaceAlpha;
   2755 		    }
   2756 		    else if(df->Gmask == 0x3e0)
   2757 		    {
   2758 #if MMX_ASMBLIT
   2759 		if(SDL_HasMMX())
   2760 			return Blit555to555SurfaceAlphaMMX;
   2761 		else
   2762 #endif
   2763 			return Blit555to555SurfaceAlpha;
   2764 		    }
   2765 		}
   2766 		return BlitNtoNSurfaceAlpha;
   2767 
   2768 	    case 4:
   2769 		if(sf->Rmask == df->Rmask
   2770 		   && sf->Gmask == df->Gmask
   2771 		   && sf->Bmask == df->Bmask
   2772 		   && sf->BytesPerPixel == 4)
   2773 		{
   2774 #if MMX_ASMBLIT
   2775 			if(sf->Rshift % 8 == 0
   2776 			   && sf->Gshift % 8 == 0
   2777 			   && sf->Bshift % 8 == 0
   2778 			   && SDL_HasMMX())
   2779 			    return BlitRGBtoRGBSurfaceAlphaMMX;
   2780 #endif
   2781 			if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
   2782 			{
   2783 #if SDL_ALTIVEC_BLITTERS
   2784 				if(!(surface->map->dst->flags & SDL_HWSURFACE)
   2785 					&& SDL_HasAltiVec())
   2786 					return BlitRGBtoRGBSurfaceAlphaAltivec;
   2787 #endif
   2788 				return BlitRGBtoRGBSurfaceAlpha;
   2789 			}
   2790 		}
   2791 #if SDL_ALTIVEC_BLITTERS
   2792 		if((sf->BytesPerPixel == 4) &&
   2793 		   !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
   2794 			return Blit32to32SurfaceAlphaAltivec;
   2795 		else
   2796 #endif
   2797 			return BlitNtoNSurfaceAlpha;
   2798 
   2799 	    case 3:
   2800 	    default:
   2801 		return BlitNtoNSurfaceAlpha;
   2802 	    }
   2803 	}
   2804     } else {
   2805 	/* Per-pixel alpha blits */
   2806 	switch(df->BytesPerPixel) {
   2807 	case 1:
   2808 	    return BlitNto1PixelAlpha;
   2809 
   2810 	case 2:
   2811 #if SDL_ALTIVEC_BLITTERS
   2812 	if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
   2813            df->Gmask == 0x7e0 &&
   2814 	   df->Bmask == 0x1f && SDL_HasAltiVec())
   2815             return Blit32to565PixelAlphaAltivec;
   2816         else
   2817 #endif
   2818 	    if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
   2819 	       && sf->Gmask == 0xff00
   2820 	       && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
   2821 		   || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
   2822 		if(df->Gmask == 0x7e0)
   2823 		    return BlitARGBto565PixelAlpha;
   2824 		else if(df->Gmask == 0x3e0)
   2825 		    return BlitARGBto555PixelAlpha;
   2826 	    }
   2827 	    return BlitNtoNPixelAlpha;
   2828 
   2829 	case 4:
   2830 	    if(sf->Rmask == df->Rmask
   2831 	       && sf->Gmask == df->Gmask
   2832 	       && sf->Bmask == df->Bmask
   2833 	       && sf->BytesPerPixel == 4)
   2834 	    {
   2835 #if MMX_ASMBLIT
   2836 		if(sf->Rshift % 8 == 0
   2837 		   && sf->Gshift % 8 == 0
   2838 		   && sf->Bshift % 8 == 0
   2839 		   && sf->Ashift % 8 == 0
   2840 		   && sf->Aloss == 0)
   2841 		{
   2842 			if(SDL_Has3DNow())
   2843 				return BlitRGBtoRGBPixelAlphaMMX3DNOW;
   2844 			if(SDL_HasMMX())
   2845 				return BlitRGBtoRGBPixelAlphaMMX;
   2846 		}
   2847 #endif
   2848 		if(sf->Amask == 0xff000000)
   2849 		{
   2850 #if SDL_ALTIVEC_BLITTERS
   2851 			if(!(surface->map->dst->flags & SDL_HWSURFACE)
   2852 				&& SDL_HasAltiVec())
   2853 				return BlitRGBtoRGBPixelAlphaAltivec;
   2854 #endif
   2855 			return BlitRGBtoRGBPixelAlpha;
   2856 		}
   2857 	    }
   2858 #if SDL_ALTIVEC_BLITTERS
   2859 	    if (sf->Amask && sf->BytesPerPixel == 4 &&
   2860 	        !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
   2861 		return Blit32to32PixelAlphaAltivec;
   2862 	    else
   2863 #endif
   2864 		return BlitNtoNPixelAlpha;
   2865 
   2866 	case 3:
   2867 	default:
   2868 	    return BlitNtoNPixelAlpha;
   2869 	}
   2870     }
   2871 }
   2872 
   2873