1 /* 2 SDL - Simple DirectMedia Layer 3 Copyright (C) 1997-2012 Sam Lantinga 4 5 This library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 This library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with this library; if not, write to the Free Software 17 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 19 Sam Lantinga 20 slouken (at) libsdl.org 21 */ 22 #include "SDL_config.h" 23 24 #include "SDL_video.h" 25 #include "SDL_blit.h" 26 27 /* 28 In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on. 29 Checking if _mm_free is #defined in malloc.h is is the only way to 30 determine if the Processor Pack is installed, as far as I can tell. 31 */ 32 33 #if SDL_ASSEMBLY_ROUTINES 34 # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) 35 /* forced MMX to 0...it breaks on most compilers now. --ryan. 
*/
#  define MMX_ASMBLIT 0
#  define GCC_ASMBLIT 0
# elif defined(_MSC_VER) && defined(_M_IX86)
#  if (_MSC_VER <= 1200)
#   include <malloc.h>
#   if defined(_mm_free)
#    define HAVE_MMINTRIN_H 1
#   endif
#  else /* Visual Studio > VC6 always has mmintrin.h */
#   define HAVE_MMINTRIN_H 1
#  endif
#  if HAVE_MMINTRIN_H
#   define MMX_ASMBLIT 1
#   define MSVC_ASMBLIT 1
#  endif
# endif
#endif /* SDL_ASSEMBLY_ROUTINES */

/* Function to check the CPU flags */
#include "SDL_cpuinfo.h"
#if GCC_ASMBLIT
#include "mmx.h"
#elif MSVC_ASMBLIT
#include <mmintrin.h>
#include <mm3dnow.h>
#endif

/* Functions to perform alpha blended blitting */

/* N->1 blending with per-surface alpha.
 * Reads sR/sG/sB from each source pixel (any BytesPerPixel, via the
 * DISEMBLE_RGB helper macro from "SDL_blit.h"), blends against the RGB of
 * the current 8-bit destination pixel looked up in the destination palette,
 * then re-packs the blended result into the destination's 8-bit format.
 * The constant blend factor A is the source surface's per-surface alpha. */
static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint8 *src = info->s_pixels;
	int srcskip = info->s_skip;
	Uint8 *dst = info->d_pixels;
	int dstskip = info->d_skip;
	Uint8 *palmap = info->table;
	SDL_PixelFormat *srcfmt = info->src;
	SDL_PixelFormat *dstfmt = info->dst;
	int srcbpp = srcfmt->BytesPerPixel;

	const unsigned A = srcfmt->alpha;

	while ( height-- ) {
	    /* DUFFS_LOOP4: 4x-unrolled per-pixel loop macro from "SDL_blit.h" */
	    DUFFS_LOOP4(
	    {
		Uint32 Pixel;
		unsigned sR;
		unsigned sG;
		unsigned sB;
		unsigned dR;
		unsigned dG;
		unsigned dB;
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
		dR = dstfmt->palette->colors[*dst].r;
		dG = dstfmt->palette->colors[*dst].g;
		dB = dstfmt->palette->colors[*dst].b;
		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
		dR &= 0xff;
		dG &= 0xff;
		dB &= 0xff;
		/* Pack RGB into 8bit pixel: 3-3-2 layout (RRRGGGBB) when there
		   is no mapping table, otherwise index the map with the 3-3-2
		   value to reach the destination palette. */
		if ( palmap == NULL ) {
		    *dst =((dR>>5)<<(3+2))|
			  ((dG>>5)<<(2))|
			  ((dB>>6)<<(0));
		} else {
		    *dst = palmap[((dR>>5)<<(3+2))|
				  ((dG>>5)<<(2))|
				  ((dB>>6)<<(0))];
		}
		dst++;
		src += srcbpp;
	    },
	    width);
	    src += srcskip;
	    dst += dstskip;
	}
}

/* N->1 blending with pixel alpha */
/* N->1 blending with per-pixel alpha.
 * Same as BlitNto1SurfaceAlpha, but the blend factor sA is extracted from
 * each source pixel (DISEMBLE_RGBA) instead of being a per-surface constant. */
static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint8 *src = info->s_pixels;
	int srcskip = info->s_skip;
	Uint8 *dst = info->d_pixels;
	int dstskip = info->d_skip;
	Uint8 *palmap = info->table;
	SDL_PixelFormat *srcfmt = info->src;
	SDL_PixelFormat *dstfmt = info->dst;
	int srcbpp = srcfmt->BytesPerPixel;

	/* FIXME: fix alpha bit field expansion here too? */
	while ( height-- ) {
	    DUFFS_LOOP4(
	    {
		Uint32 Pixel;
		unsigned sR;
		unsigned sG;
		unsigned sB;
		unsigned sA;
		unsigned dR;
		unsigned dG;
		unsigned dB;
		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
		dR = dstfmt->palette->colors[*dst].r;
		dG = dstfmt->palette->colors[*dst].g;
		dB = dstfmt->palette->colors[*dst].b;
		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
		dR &= 0xff;
		dG &= 0xff;
		dB &= 0xff;
		/* Pack RGB into 8bit pixel: 3-3-2 RRRGGGBB, optionally mapped
		   through the palette mapping table. */
		if ( palmap == NULL ) {
		    *dst =((dR>>5)<<(3+2))|
			  ((dG>>5)<<(2))|
			  ((dB>>6)<<(0));
		} else {
		    *dst = palmap[((dR>>5)<<(3+2))|
				  ((dG>>5)<<(2)) |
				  ((dB>>6)<<(0)) ];
		}
		dst++;
		src += srcbpp;
	    },
	    width);
	    src += srcskip;
	    dst += dstskip;
	}
}

/* colorkeyed N->1 blending with per-surface alpha.
 * Pixels matching the source colorkey are skipped entirely. */
static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint8 *src = info->s_pixels;
	int srcskip = info->s_skip;
	Uint8 *dst = info->d_pixels;
	int dstskip = info->d_skip;
	Uint8 *palmap = info->table;
	SDL_PixelFormat *srcfmt = info->src;
	SDL_PixelFormat *dstfmt = info->dst;
	int srcbpp = srcfmt->BytesPerPixel;
	Uint32 ckey = srcfmt->colorkey;

	/* was 'const int A': alpha is an 8-bit quantity; use unsigned for
	   consistency with BlitNto1SurfaceAlpha above. */
	const unsigned A = srcfmt->alpha;

	while ( height-- ) {
	    DUFFS_LOOP(
	    {
		Uint32 Pixel;
		unsigned sR;
		unsigned sG;
		unsigned sB;
		unsigned dR;
		unsigned dG;
		unsigned dB;
		DISEMBLE_RGB(src,
srcbpp, srcfmt, Pixel, sR, sG, sB);
		/* blend only pixels that don't match the source colorkey */
		if ( Pixel != ckey ) {
		    dR = dstfmt->palette->colors[*dst].r;
		    dG = dstfmt->palette->colors[*dst].g;
		    dB = dstfmt->palette->colors[*dst].b;
		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
		    dR &= 0xff;
		    dG &= 0xff;
		    dB &= 0xff;
		    /* Pack RGB into 8bit pixel: 3-3-2 RRRGGGBB, optionally
		       mapped through the palette mapping table. */
		    if ( palmap == NULL ) {
			*dst =((dR>>5)<<(3+2))|
			      ((dG>>5)<<(2)) |
			      ((dB>>6)<<(0));
		    } else {
			*dst = palmap[((dR>>5)<<(3+2))|
				      ((dG>>5)<<(2)) |
				      ((dB>>6)<<(0)) ];
		    }
		}
		dst++;
		src += srcbpp;
	    },
	    width);
	    src += srcskip;
	    dst += dstskip;
	}
}

#if GCC_ASMBLIT
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case.
 * alpha=128 lets the blend degenerate into an average of src and dst:
 * the bytewise mean is computed as ((s & 0xfe..) + (d & 0xfe..)) >> 1 plus
 * the carry of the dropped low bits (s & d & 0x01..), two pixels at a time
 * in MMX.  Uses the mmx.h pseudo-intrinsics (GCC inline asm). */
static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	Uint32 dalpha = info->dst->Amask;
	Uint64 load;

	load = 0x00fefefe00fefefeULL;/* alpha128 mask */
	movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
	load = 0x0001010100010101ULL;/* !alpha128 mask */
	movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
	movd_m2r(dalpha, mm7); /* dst alpha mask */
	punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
	while(height--) {
		/* DUFFS_LOOP_DOUBLE2: first block handles a single (odd)
		   pixel in C, second block handles pixel pairs with MMX */
		DUFFS_LOOP_DOUBLE2(
		{
			Uint32 s = *srcp++;
			Uint32 d = *dstp;
			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
				   + (s & d & 0x00010101)) | dalpha;
		},{
			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */

			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */

			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 (carry bits) */
			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */

			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
			dstp += 2;
			srcp += 2;
		}, width);
		srcp += srcskip;
		dstp += dstskip;
	}
	emms();
}

/* fast RGB888->(A)RGB888 blending with surface alpha.
 * General case computes dst = dst + ((src - dst) * alpha >> 8) per channel
 * in 16-bit MMX lanes; alpha == 128 with RGB in the low 24 bits dispatches
 * to the cheaper averaging special case above. */
static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
{
	SDL_PixelFormat* df = info->dst;
	unsigned alpha = info->src->alpha;

	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
		/* only call a128 version when R,G,B occupy lower bits */
		BlitRGBtoRGBSurfaceAlpha128MMX(info);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint32 *srcp = (Uint32 *)info->s_pixels;
		int srcskip = info->s_skip >> 2;
		Uint32 *dstp = (Uint32 *)info->d_pixels;
		int dstskip = info->d_skip >> 2;

		pxor_r2r(mm5, mm5); /* 0 -> mm5 */
		/* form the alpha mult */
		movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
		punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
		punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
		/* 'alpha' is reused here as the R|G|B channel mask so the
		   multiplier is zeroed in the destination-alpha lane */
		alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
		movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
		punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
		pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
		/* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
		movd_m2r(df->Amask, mm7); /* dst alpha mask */
		punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */

		while(height--) {
			DUFFS_LOOP_DOUBLE2({
				/* One Pixel Blend */
				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
				punpcklbw_r2r(mm5, mm2); /*
0A0R0G0B -> mm2(dst) */

				/* blend: dst += (src - dst) * alpha >> 8, per 16-bit lane */
				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */

				packuswb_r2r(mm5, mm2); /* ARGBARGB -> mm2 */
				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
				++srcp;
				++dstp;
			},{
				/* Two Pixels Blend */
				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */

				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */

				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */

				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */

				packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */
				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */

				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */

				srcp += 2;
				dstp += 2;
			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
		emms();
	}
}

/* fast ARGB888->(A)RGB888 blending with pixel alpha.
 * Per-pixel alpha is broadcast into the low three 16-bit lanes; the
 * destination alpha lane is masked out so it is preserved by the add. */
static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	SDL_PixelFormat* sf =
info->src;
	Uint32 amask = sf->Amask;

	pxor_r2r(mm6, mm6); /* 0 -> mm6 */
	/* form multiplication mask: all channel bytes except alpha */
	movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
	punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
	pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
	movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
	pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
	/* form channel masks */
	movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
	packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
	packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
	pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
	/* get alpha channel shift: kept in mm5 for psrld in the loop below */
	__asm__ __volatile__ (
		"movd %0, %%mm5"
		: : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */

	while(height--) {
		DUFFS_LOOP4({
		Uint32 alpha = *srcp & amask;
		/* FIXME: Here we special-case opaque alpha since the
		   compositioning used (>>8 instead of /255) doesn't handle
		   it correctly. Also special-case alpha=0 for speed?
		   Benchmark this!
		*/
		if(alpha == 0) {
			/* do nothing: fully transparent source pixel */
		} else if(alpha == amask) {
			/* opaque alpha -- copy RGB, keep dst alpha */
			/* using MMX here to free up regular registers for other things */
			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
			por_r2r(mm1, mm2); /* src | dst -> mm2 */
			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
		} else {
			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */

			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */

			/* broadcast the source alpha to all three color lanes */
			__asm__ __volatile__ (
				"movd %0, %%mm4"
				: : "r" (alpha) ); /* 0000A000 -> mm4 */
			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */

			/* blend: dst += (src - dst) * alpha >> 8 */
			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */

			packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */
			movd_r2m(mm2, *dstp);/* mm2 -> dst */
		}
		++srcp;
		++dstp;
		}, width);
		srcp += srcskip;
		dstp += dstskip;
	}
	emms();
}
/* End GCC_ASMBLIT */

#elif MSVC_ASMBLIT
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case.
 * Same averaging trick as the GCC version, written with MMX intrinsics:
 * (s+d)/2 per byte = ((s & 0xfe..) + (d & 0xfe..)) >> 1 plus the carry of
 * the dropped low bits (s & d & 0x01..). */
static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	Uint32 dalpha = info->dst->Amask;

	__m64 src1, src2, dst1, dst2, lmask, hmask, dsta;

	hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
	lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
	dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */

	while (height--) {
		int n = width;
		/* handle one pixel in plain C if the row width is odd, so the
		   MMX loop below can always work on pixel pairs */
		if ( n & 1 ) {
			Uint32 s = *srcp++;
			Uint32 d = *dstp;
			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
				   + (s & d & 0x00010101)) | dalpha;
			n--;
		}

		for (n >>= 1; n > 0; --n) {
			dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
			dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */

			src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
			src2 = src1; /* 2 x src -> src2(ARGBARGB) */

			dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
			src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
			src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
			src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */

			dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
			dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
			dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
			dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */

			*(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
			dstp += 2;
			srcp += 2;
		}

		srcp += srcskip;
		dstp += dstskip;
	}
	_mm_empty();
}

/* fast RGB888->(A)RGB888 blending with surface alpha (MMX intrinsics).
 * dst = dst + ((src - dst) * alpha >> 8) per channel in 16-bit lanes. */
static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
{
	SDL_PixelFormat* df = info->dst;
	/* NOTE(review): this initializer is dead -- chanmask is overwritten
	   from the shift values before first use in the else branch below */
	Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
	unsigned alpha = info->src->alpha;

	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
		/* only call a128 version when R,G,B occupy lower bits */
		BlitRGBtoRGBSurfaceAlpha128MMX(info);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint32 *srcp =
		(Uint32 *)info->s_pixels;
		int srcskip = info->s_skip >> 2;
		Uint32 *dstp = (Uint32 *)info->d_pixels;
		int dstskip = info->d_skip >> 2;
		Uint32 dalpha = df->Amask;
		Uint32 amult;

		__m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;

		mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
		/* form the alpha mult: replicate alpha into all four bytes,
		   then clear the byte belonging to the destination alpha */
		amult = alpha | (alpha << 8);
		amult = amult | (amult << 16);
		chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
		mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
		mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
		/* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
		dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */

		while (height--) {
			int n = width;
			/* scalar-ish prologue keeps the paired loop aligned on
			   2-pixel groups when the row width is odd */
			if (n & 1) {
				/* One Pixel Blend */
				src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
				src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */

				dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */

				src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst1 -> src2 */
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
				dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */

				dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
				*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */

				++srcp;
				++dstp;

				n--;
			}

			for (n >>= 1; n > 0; --n) {
				/* Two Pixels Blend */
				src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
				src2 = src1; /* 2 x src -> src2(ARGBARGB) */
				src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
				src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2
			*/

			dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
			dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
			dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */

			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
			src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */

			src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
			src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
			src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
			dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */

			dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
			dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */

			*(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */

			srcp += 2;
			dstp += 2;
			}
			srcp += srcskip;
			dstp += dstskip;
		}
		_mm_empty();
	}
}

/* fast ARGB888->(A)RGB888 blending with pixel alpha (MMX intrinsics).
 * The per-pixel alpha byte is broadcast into the three color lanes; the
 * destination alpha lane is masked off so it is preserved by the add. */
static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	SDL_PixelFormat* sf = info->src;
	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
	Uint32 amask = sf->Amask;
	Uint32 ashift = sf->Ashift;
	Uint64 multmask;

	__m64 src1, dst1, mm_alpha, mm_zero, dmask;

	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
	/* clear the 16-bit lane holding alpha: byte position ashift/8 maps to
	   bit position ashift*2 once each byte is widened to a 16-bit lane
	   (0xFFFFi64 is the MSVC-specific 64-bit literal suffix) */
	multmask = ~(0xFFFFi64 << (ashift * 2));
	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */

	while(height--) {
		DUFFS_LOOP4({
		Uint32 alpha = *srcp & amask;
		if (alpha == 0) {
			/* do nothing: fully transparent source pixel */
		}
		else if (alpha == amask) {
			/* opaque alpha -- copy RGB, keep dst alpha */
			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
		} else {
			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */

			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */

			/* broadcast alpha to the three color lanes */
			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */

			/* blend: dst += (src - dst) * alpha >> 8 */
			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
			dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */

			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
		}
		++srcp;
		++dstp;
		}, width);
		srcp += srcskip;
		dstp += dstskip;
	}
	_mm_empty();
}
/* End MSVC_ASMBLIT */

#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */

#if SDL_ALTIVEC_BLITTERS
#if __MWERKS__
#pragma altivec_model on
#endif
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include <assert.h>

/* Older Apple GCC spells vector literals with parentheses instead of
   braces; hide the difference behind these macros. */
#if (defined(__MACOSX__) && (__GNUC__ < 4))
#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
	(vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
	(vector unsigned short) ( a,b,c,d,e,f,g,h )
#else
#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p)
\
	(vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
	(vector unsigned short) { a,b,c,d,e,f,g,h }
#endif

/* true when the pointer is not 16-byte aligned (vector load boundary) */
#define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
/* debug helper: print a vector as four 32-bit words */
#define VECPRINT(msg, v) do { \
	vector unsigned int tmpvec = (vector unsigned int)(v); \
	unsigned int *vp = (unsigned int *)&tmpvec; \
	printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
} while (0)

/* the permuation vector that takes the high bytes out of all the appropriate shorts
	(vector unsigned char)(
		0x00, 0x10, 0x02, 0x12,
		0x04, 0x14, 0x06, 0x16,
		0x08, 0x18, 0x0A, 0x1A,
		0x0C, 0x1C, 0x0E, 0x1E );
   built without a literal: vec_lvsl(0, NULL) yields 0x00..0x0F, and adding
   a splat of 0x0F to the odd positions produces the pattern above. */
#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
/* constant 24 per element, built as 12 + 12 (splat immediates are 5-bit) */
#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
/* 0xFF000000 in each 32-bit element: the big-endian ARGB alpha byte */
#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
/* permute vector for loading possibly-unaligned source pixels */
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
	? vec_lvsl(0, src) \
	: vec_add(vec_lvsl(8, src), vec_splat_u8(8)))


/* vd = (vs * valpha + vd * (255 - valpha)) / 255, approximated with the
   (x + 1 + ((x + 1) >> 8)) >> 8 trick, for 16 bytes of ARGB at once. */
#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
	/* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
	vector unsigned short vtemp1 = vec_mule(vs, valpha); \
	/* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
	vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
	/* valpha2 is 255-alpha */ \
	vector unsigned char valpha2 = vec_nor(valpha, valpha); \
	/* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
	vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
	/* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
	vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
	/* add source and dest */ \
	vtemp1 = vec_add(vtemp1, vtemp3); \
	vtemp2 = vec_add(vtemp2, vtemp4); \
	/* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
	vtemp1 = vec_add(vtemp1, v1_16); \
	vtemp3 = vec_sr(vtemp1, v8_16); \
	vtemp1 = vec_add(vtemp1, vtemp3); \
	/* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
	vtemp2 = vec_add(vtemp2, v1_16); \
	vtemp4 = vec_sr(vtemp2, v8_16); \
	vtemp2 = vec_add(vtemp2, vtemp4); \
	/* (>>8) and get ARGBARGBARGBARGB */ \
	vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
} while (0)

/* Calculate the permute vector used for 32->32 swizzling */
static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
					   const SDL_PixelFormat *dstfmt)
{
	/*
	 * We have to assume that the bits that aren't used by other
	 * colors is alpha, and it's one complete byte, since some formats
	 * leave alpha with a zero mask, but we should still swizzle the bits.
	 */
	/* ARGB */
	const static struct SDL_PixelFormat default_pixel_format = {
		NULL, 0, 0,
		0, 0, 0, 0,
		16, 8, 0, 24,
		0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
		0, 0};
	if (!srcfmt) {
		srcfmt = &default_pixel_format;
	}
	if (!dstfmt) {
		dstfmt = &default_pixel_format;
	}
	const vector unsigned char plus = VECUINT8_LITERAL
		( 0x00, 0x00, 0x00, 0x00,
		  0x04, 0x04, 0x04, 0x04,
		  0x08, 0x08, 0x08, 0x08,
		  0x0C, 0x0C, 0x0C, 0x0C );
	vector unsigned char vswiz;
	vector unsigned int srcvec;
/* map a channel's bit shift (0/8/16/24) to its byte index within a
   big-endian 32-bit pixel (3/2/1/0) */
#define RESHIFT(X) (3 - ((X) >> 3))
	Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
	Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
	Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
	Uint32 amask;
	/* Use zero for alpha if either surface doesn't have alpha */
	if (dstfmt->Amask) {
		amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
	} else {
		amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
	}
#undef RESHIFT
	/* splat the per-pixel byte-selector word and add per-pixel offsets
	   (0, 4, 8, 12) to build the full 16-byte permute vector */
	((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
	vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
	return(vswiz);
}

/* ARGB8888 -> RGB565 blit with per-pixel alpha, AltiVec version.
 * Blends 8 pixels per vector iteration; unaligned head/tail pixels are
 * handled by the scalar ONE_PIXEL_BLEND macro inside the row loop. */
static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
{
	int height = info->d_height;
	Uint8 *src = (Uint8 *)info->s_pixels;
	int srcskip = info->s_skip;
	Uint8 *dst = (Uint8 *)info->d_pixels;
	int dstskip = info->d_skip;
	SDL_PixelFormat *srcfmt = info->src;

	vector unsigned char v0 = vec_splat_u8(0);
	vector unsigned short v8_16 = vec_splat_u16(8);
	vector unsigned short v1_16 = vec_splat_u16(1);
	vector unsigned short v2_16 = vec_splat_u16(2);
	vector unsigned short v3_16 = vec_splat_u16(3);
	vector unsigned int v8_32 = vec_splat_u32(8);
	vector unsigned int v16_32 = vec_add(v8_32, v8_32);
	vector unsigned short v3f = VECUINT16_LITERAL(
		0x003f, 0x003f, 0x003f, 0x003f,
		0x003f, 0x003f, 0x003f, 0x003f);
	vector unsigned short vfc = VECUINT16_LITERAL(
		0x00fc, 0x00fc, 0x00fc, 0x00fc,
		0x00fc, 0x00fc, 0x00fc, 0x00fc);

	/* permute selectors for expanding four 565 pixels to 8888:
	    0x10 - 0x1f is the alpha
	    0x00 - 0x0e evens are the red
	    0x01 - 0x0f odds are zero
	*/
	vector unsigned char vredalpha1 = VECUINT8_LITERAL(
		0x10, 0x00, 0x01, 0x01,
		0x10, 0x02, 0x01, 0x01,
		0x10, 0x04, 0x01, 0x01,
		0x10, 0x06, 0x01, 0x01
	);
	/* same selector shifted to cover the second group of four pixels */
	vector unsigned char vredalpha2 = (vector unsigned char)(
		vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
	);
	/*
	    0x00 - 0x0f is ARxx ARxx ARxx ARxx
	    0x11 - 0x0f odds are blue
	*/
	vector unsigned char vblue1 = VECUINT8_LITERAL(
		0x00, 0x01, 0x02, 0x11,
		0x04, 0x05, 0x06, 0x13,
		0x08, 0x09, 0x0a, 0x15,
		0x0c, 0x0d, 0x0e, 0x17
	);
	vector unsigned char vblue2 = (vector unsigned char)(
		vec_add((vector unsigned int)vblue1, v8_32)
	);
	/*
	    0x00 - 0x0f is ARxB ARxB ARxB ARxB
	    0x10 - 0x0e evens are green
	*/
	vector unsigned char vgreen1 = VECUINT8_LITERAL(
		0x00, 0x01, 0x10, 0x03,
		0x04, 0x05, 0x12, 0x07,
		0x08, 0x09, 0x14, 0x0b,
		0x0c, 0x0d, 0x16, 0x0f
	);
	vector unsigned char vgreen2 = (vector unsigned char)(
		vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
	);
	/* selector that gathers the green bytes of 8 blended ARGB pixels */
	vector unsigned char vgmerge = VECUINT8_LITERAL(
		0x00, 0x02, 0x00, 0x06,
		0x00, 0x0a, 0x00, 0x0e,
		0x00, 0x12, 0x00, 0x16,
		0x00, 0x1a, 0x00, 0x1e);
	vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
	vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
	/* selects the alpha byte of each source pixel into every lane */
	vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));

	/* 0xf800 per 16-bit element: the 565 red-channel mask */
	vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
	vf800 = vec_sl(vf800, vec_splat_u16(8));

	while(height--) {
		int extrawidth;
		vector unsigned char valigner;
		vector unsigned char vsrc;
		vector unsigned char voverflow;
		int width = info->d_width;

/* scalar fallback used for the unaligned row head and the % 8 tail */
#define ONE_PIXEL_BLEND(condition, widthvar) \
		while (condition) { \
			Uint32 Pixel; \
			unsigned sR, sG, sB, dR, dG, dB, sA; \
			DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
			if(sA) { \
				unsigned short dstpixel = *((unsigned short *)dst); \
				dR = (dstpixel >> 8) & 0xf8; \
				dG = (dstpixel >> 3) & 0xfc; \
				dB = (dstpixel << 3) & 0xf8; \
				ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
				*((unsigned short *)dst) = ( \
					((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
				); \
			} \
			src += 4; \
			dst += 2; \
			widthvar--; \
		}
		ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
		extrawidth = (width % 8);
		valigner = VEC_ALIGNER(src);
		vsrc = (vector unsigned char)vec_ld(0, src);
		width -= extrawidth;
		while (width) {
			vector unsigned char valpha;
			vector unsigned char vsrc1, vsrc2;
			vector unsigned char vdst1, vdst2;
			vector unsigned short vR, vG, vB;
			vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;

			/* Load 8 pixels from src as ARGB (alignment fixed up
			   with the vec_perm/valigner double-load idiom) */
			voverflow = (vector unsigned char)vec_ld(15, src);
			vsrc = vec_perm(vsrc, voverflow, valigner);
			vsrc1 = vec_perm(vsrc, vsrc, vpermute);
			src += 16;
			vsrc = (vector unsigned char)vec_ld(15, src);
			voverflow = vec_perm(voverflow, vsrc, valigner);
			vsrc2 = vec_perm(voverflow, voverflow, vpermute);
			src += 16;

			/* Load 8 pixels from dst as XRGB, widening 565 to 8888
			   via the vredalpha/vblue/vgreen permute selectors */
			voverflow = vec_ld(0, dst);
			vR = vec_and((vector unsigned short)voverflow, vf800);
			vB = vec_sl((vector unsigned short)voverflow, v3_16);
			vG = vec_sl(vB, v2_16);
			vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
			vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
			vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
			vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector
unsigned char)vR, vredalpha2);
			vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
			vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);

			/* Alpha blend 8 pixels as ARGB */
			valpha = vec_perm(vsrc1, v0, valphaPermute);
			VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
			valpha = vec_perm(vsrc2, v0, valphaPermute);
			VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);

			/* Convert 8 pixels to 565 (pack, then re-insert the 6-bit green) */
			vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
			vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
			vgpixel = vec_and(vgpixel, vfc);
			vgpixel = vec_sl(vgpixel, v3_16);
			vrpixel = vec_sl(vpixel, v1_16);
			vrpixel = vec_and(vrpixel, vf800);
			vbpixel = vec_and(vpixel, v3f);
			vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
			vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);

			/* Store 8 pixels */
			vec_st(vdst1, 0, dst);

			width -= 8;
			dst += 16;
		}
		/* scalar tail for the leftover (width % 8) pixels */
		ONE_PIXEL_BLEND((extrawidth), extrawidth);
#undef ONE_PIXEL_BLEND
		src += srcskip;
		dst += dstskip;
	}
}

/* 32bpp -> 32bpp blit with per-surface alpha AND a source colorkey,
   AltiVec version.  Pixels equal to the colorkey (compared on the RGB
   channels only) are left untouched in the destination; all others are
   blended with the constant surface alpha.  Unaligned leading/trailing
   pixels are handled by the scalar ONE_PIXEL_BLEND macro, the aligned
   middle of each row 4 pixels at a time with AltiVec. */
static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	SDL_PixelFormat *srcfmt = info->src;
	SDL_PixelFormat *dstfmt = info->dst;
	unsigned sA = srcfmt->alpha;
	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
	Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
	Uint32 ckey = info->src->colorkey;
	vector unsigned char mergePermute;
	vector unsigned char vsrcPermute;
	vector unsigned char vdstPermute;
	vector unsigned char vsdstPermute;
	vector unsigned char valpha;
	vector unsigned char valphamask;
	vector unsigned char vbits;
	vector unsigned char v0;
	vector unsigned short v1;
	vector unsigned short v8;
	vector unsigned int vckey;
	vector unsigned int vrgbmask;

	mergePermute = VEC_MERGE_PERMUTE();
	v0 = vec_splat_u8(0);
	v1 = vec_splat_u16(1);
	v8 = vec_splat_u16(8);

	/* set the alpha to 255 on the destination surf */
	valphamask = VEC_ALPHA_MASK();

	/* permute vectors converting between src/dst channel order and the
	   canonical order the vector math is done in */
	vsrcPermute = calc_swizzle32(srcfmt, NULL);
	vdstPermute = calc_swizzle32(NULL, dstfmt);
	vsdstPermute = calc_swizzle32(dstfmt, NULL);

	/* set a vector full of alpha and 255-alpha */
	((unsigned char *)&valpha)[0] = alpha;
	valpha = vec_splat(valpha, 0);
	vbits = (vector unsigned char)vec_splat_s8(-1);

	/* splat the colorkey and the RGB mask across whole vectors */
	ckey &= rgbmask;
	((unsigned int *)(char*)&vckey)[0] = ckey;
	vckey = vec_splat(vckey, 0);
	((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
	vrgbmask = vec_splat(vrgbmask, 0);

	while(height--) {
		int width = info->d_width;
/* scalar per-pixel blend used for the unaligned head and the tail of a row */
#define ONE_PIXEL_BLEND(condition, widthvar) \
	while (condition) { \
		Uint32 Pixel; \
		unsigned sR, sG, sB, dR, dG, dB; \
		RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
		if(sA && Pixel != ckey) { \
			RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
			DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
			ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
			ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
		} \
		dstp++; \
		srcp++; \
		widthvar--; \
	}
		/* blend until dstp is 16-byte aligned */
		ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
		if (width > 0) {
			int extrawidth = (width % 4);
			vector unsigned char valigner = VEC_ALIGNER(srcp);
			vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
			width -= extrawidth;
			while (width) {
				vector unsigned char vsel;
				vector unsigned char voverflow;
				vector unsigned char vd;
				vector unsigned char vd_orig;

				/* s = *srcp (classic vec_ld/vec_perm unaligned load) */
				voverflow = (vector unsigned char)vec_ld(15, srcp);
				vs = vec_perm(vs, voverflow, valigner);

				/* vsel is set for items that match the key */
				vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
				vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);

				/* permute to source format */
				vs = vec_perm(vs, valpha, vsrcPermute);

				/* d = *dstp; keep the original for the keyed pixels */
				vd = (vector unsigned char)vec_ld(0, dstp);
				vd_orig = vd = vec_perm(vd, v0, vsdstPermute);

				VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

				/* set the alpha channel to full on */
				vd = vec_or(vd, valphamask);

				/* mask out color key: keyed lanes get the untouched dst */
				vd = vec_sel(vd, vd_orig, vsel);

				/* permute to dest format */
				vd = vec_perm(vd, vbits, vdstPermute);

				/* *dstp = res */
				vec_st((vector unsigned int)vd, 0, dstp);

				srcp += 4;
				dstp += 4;
				width -= 4;
				vs = voverflow;
			}
			ONE_PIXEL_BLEND((extrawidth), extrawidth);
		}
#undef ONE_PIXEL_BLEND

		srcp += srcskip;
		dstp += dstskip;
	}
}


/* 32bpp -> 32bpp blit with per-pixel (source) alpha, AltiVec version.
   Destination alpha is preserved.  Skips fully transparent source pixels
   in the scalar path. */
static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	SDL_PixelFormat *srcfmt = info->src;
	SDL_PixelFormat *dstfmt = info->dst;
	vector unsigned char mergePermute;
	vector unsigned char valphaPermute;
	vector unsigned char vsrcPermute;
	vector unsigned char
vdstPermute;
	vector unsigned char vsdstPermute;
	vector unsigned char valphamask;
	vector unsigned char vpixelmask;
	vector unsigned char v0;
	vector unsigned short v1;
	vector unsigned short v8;

	v0 = vec_splat_u8(0);
	v1 = vec_splat_u16(1);
	v8 = vec_splat_u16(8);
	mergePermute = VEC_MERGE_PERMUTE();
	valphamask = VEC_ALPHA_MASK();
	/* 0x0C & lvsl index pattern -> replicate byte 3 (alpha) of each pixel */
	valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
	/* vpixelmask = ~valphamask: selects the RGB (non-alpha) bytes */
	vpixelmask = vec_nor(valphamask, v0);
	vsrcPermute = calc_swizzle32(srcfmt, NULL);
	vdstPermute = calc_swizzle32(NULL, dstfmt);
	vsdstPermute = calc_swizzle32(dstfmt, NULL);

	while ( height-- ) {
		width = info->d_width;
/* scalar per-pixel blend for the unaligned head / short tail of a row */
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
	Uint32 Pixel; \
	unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
	DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
	if(sA) { \
		DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
		ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
	} \
	++srcp; \
	++dstp; \
	widthvar--; \
}
		/* blend until dstp is 16-byte aligned */
		ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
		if (width > 0) {
			/* vsrcPermute */
			/* vdstPermute */
			int extrawidth = (width % 4);
			vector unsigned char valigner = VEC_ALIGNER(srcp);
			vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
			width -= extrawidth;
			while (width) {
				vector unsigned char voverflow;
				vector unsigned char vd;
				vector unsigned char valpha;
				vector unsigned char vdstalpha;
				/* s = *srcp (vec_ld/vec_perm unaligned load) */
				voverflow = (vector unsigned char)vec_ld(15, srcp);
				vs = vec_perm(vs, voverflow, valigner);
				vs = vec_perm(vs, v0, vsrcPermute);

				/* broadcast each pixel's source alpha to its 4 bytes */
				valpha = vec_perm(vs, v0, valphaPermute);

				/* d = *dstp */
				vd = (vector unsigned char)vec_ld(0, dstp);
				vd = vec_perm(vd, v0, vsdstPermute);
				/* remember dst alpha so it can be restored after the blend */
				vdstalpha = vec_and(vd, valphamask);

				VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

				/* set the alpha to the dest alpha */
				vd = vec_and(vd, vpixelmask);
				vd = vec_or(vd, vdstalpha);
				vd = vec_perm(vd, v0, vdstPermute);

				/* *dstp = res */
				vec_st((vector unsigned int)vd, 0, dstp);

				srcp += 4;
				dstp += 4;
				width -= 4;
				vs = voverflow;

			}
			ONE_PIXEL_BLEND((extrawidth), extrawidth);
		}
		srcp += srcskip;
		dstp += dstskip;
#undef ONE_PIXEL_BLEND
	}
}

/* fast ARGB888->(A)RGB888 blending with pixel alpha, AltiVec version.
   Fixed channel layout (alpha in the top byte), so no swizzle permutes
   are needed; dest alpha is preserved.  Scalar path special-cases
   alpha==0 (skip) and alpha==opaque (copy RGB). */
static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	vector unsigned char mergePermute;
	vector unsigned char valphaPermute;
	vector unsigned char valphamask;
	vector unsigned char vpixelmask;
	vector unsigned char v0;
	vector unsigned short v1;
	vector unsigned short v8;
	v0 = vec_splat_u8(0);
	v1 = vec_splat_u16(1);
	v8 = vec_splat_u16(8);
	mergePermute = VEC_MERGE_PERMUTE();
	valphamask = VEC_ALPHA_MASK();
	/* replicate byte 3 (alpha) of each 4-byte pixel */
	valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));


	/* ~alpha mask: selects the RGB bytes */
	vpixelmask = vec_nor(valphamask, v0);
	while(height--) {
		width = info->d_width;
/* scalar per-pixel blend (same math as BlitRGBtoRGBPixelAlpha below) */
#define ONE_PIXEL_BLEND(condition, widthvar) \
	while ((condition)) { \
		Uint32 dalpha; \
		Uint32 d; \
		Uint32 s1; \
		Uint32 d1; \
		Uint32 s = *srcp; \
		Uint32 alpha = s >> 24; \
		if(alpha) { \
			if(alpha == SDL_ALPHA_OPAQUE) { \
				*dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
			} else { \
				d = *dstp; \
				dalpha = d & 0xff000000; \
				s1 = s & 0xff00ff; \
				d1 = d & 0xff00ff; \
				d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
				s &= 0xff00; \
				d &= 0xff00; \
				d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
				*dstp = d1 | d | dalpha; \
			} \
		} \
		++srcp; \
		++dstp; \
		widthvar--; \
	}
		/* blend until dstp is 16-byte aligned */
		ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
		if (width > 0) {
			int extrawidth = (width % 4);
			vector unsigned char valigner = VEC_ALIGNER(srcp);
			vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
			width -= extrawidth;
			while (width) {
				vector unsigned char voverflow;
				vector unsigned char vd;
				vector unsigned char valpha;
				vector unsigned char vdstalpha;
				/* s = *srcp (unaligned load) */
				voverflow = (vector unsigned char)vec_ld(15, srcp);
				vs = vec_perm(vs, voverflow, valigner);

				valpha = vec_perm(vs, v0, valphaPermute);

				/* d = *dstp; keep dst alpha for later */
				vd = (vector unsigned char)vec_ld(0, dstp);
				vdstalpha = vec_and(vd, valphamask);

				VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

				/* set the alpha to the dest alpha */
				vd = vec_and(vd, vpixelmask);
				vd = vec_or(vd, vdstalpha);

				/* *dstp = res */
				vec_st((vector unsigned int)vd, 0, dstp);

				srcp += 4;
				dstp += 4;
				width -= 4;
				vs = voverflow;
			}
			ONE_PIXEL_BLEND((extrawidth), extrawidth);
		}
		srcp += srcskip;
		dstp += dstskip;
	}
#undef ONE_PIXEL_BLEND
}

/* 32bpp -> 32bpp blit with per-surface (constant) alpha, AltiVec version
   for arbitrary 32-bit channel layouts (swizzled via calc_swizzle32).
   Destination alpha is forced to opaque where the dest has an Amask. */
static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
{
	/* XXX : 6 */
	unsigned alpha = info->src->alpha;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	SDL_PixelFormat *srcfmt = info->src;
	SDL_PixelFormat *dstfmt = info->dst;
	unsigned sA = srcfmt->alpha;
	unsigned dA = dstfmt->Amask ?
SDL_ALPHA_OPAQUE : 0;
	vector unsigned char mergePermute;
	vector unsigned char vsrcPermute;
	vector unsigned char vdstPermute;
	vector unsigned char vsdstPermute;
	vector unsigned char valpha;
	vector unsigned char valphamask;
	vector unsigned char vbits;
	vector unsigned short v1;
	vector unsigned short v8;

	mergePermute = VEC_MERGE_PERMUTE();
	v1 = vec_splat_u16(1);
	v8 = vec_splat_u16(8);

	/* set the alpha to 255 on the destination surf */
	valphamask = VEC_ALPHA_MASK();

	/* channel-order swizzles between src/dst format and canonical order */
	vsrcPermute = calc_swizzle32(srcfmt, NULL);
	vdstPermute = calc_swizzle32(NULL, dstfmt);
	vsdstPermute = calc_swizzle32(dstfmt, NULL);

	/* set a vector full of alpha and 255-alpha */
	((unsigned char *)&valpha)[0] = alpha;
	valpha = vec_splat(valpha, 0);
	vbits = (vector unsigned char)vec_splat_s8(-1);

	while(height--) {
		int width = info->d_width;
/* scalar per-pixel blend for the unaligned head / short tail of a row */
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
	Uint32 Pixel; \
	unsigned sR, sG, sB, dR, dG, dB; \
	DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
	DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
	ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
	ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
	++srcp; \
	++dstp; \
	widthvar--; \
}
		/* blend until dstp is 16-byte aligned */
		ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
		if (width > 0) {
			int extrawidth = (width % 4);
			vector unsigned char valigner = VEC_ALIGNER(srcp);
			vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
			width -= extrawidth;
			while (width) {
				vector unsigned char voverflow;
				vector unsigned char vd;

				/* s = *srcp (unaligned vec_ld/vec_perm load) */
				voverflow = (vector unsigned char)vec_ld(15, srcp);
				vs = vec_perm(vs, voverflow, valigner);
				vs = vec_perm(vs, valpha, vsrcPermute);

				/* d = *dstp */
				vd = (vector unsigned char)vec_ld(0, dstp);
				vd = vec_perm(vd, vd, vsdstPermute);

				VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

				/* set the alpha channel to full on */
				vd = vec_or(vd, valphamask);
				vd = vec_perm(vd, vbits, vdstPermute);

				/* *dstp = res */
				vec_st((vector unsigned int)vd, 0, dstp);

				srcp += 4;
				dstp += 4;
				width -= 4;
				vs = voverflow;
			}
			ONE_PIXEL_BLEND((extrawidth), extrawidth);
		}
#undef ONE_PIXEL_BLEND

		srcp += srcskip;
		dstp += dstskip;
	}

}


/* fast RGB888->(A)RGB888 blending with per-surface alpha, AltiVec
   version for the fixed xRGB layout: no swizzling needed, destination
   alpha is forced to 0xff. */
static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	vector unsigned char mergePermute;
	vector unsigned char valpha;
	vector unsigned char valphamask;
	vector unsigned short v1;
	vector unsigned short v8;

	mergePermute = VEC_MERGE_PERMUTE();
	v1 = vec_splat_u16(1);
	v8 = vec_splat_u16(8);

	/* set the alpha to 255 on the destination surf */
	valphamask = VEC_ALPHA_MASK();

	/* set a vector full of alpha and 255-alpha */
	((unsigned char *)&valpha)[0] = alpha;
	valpha = vec_splat(valpha, 0);

	while(height--) {
		int width = info->d_width;
/* scalar per-pixel blend: red+blue in one 32-bit op, green separately */
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
	Uint32 s = *srcp; \
	Uint32 d = *dstp; \
	Uint32 s1 = s & 0xff00ff; \
	Uint32 d1 = d & 0xff00ff; \
	d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
	     & 0xff00ff; \
	s &= 0xff00; \
	d &= 0xff00; \
	d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
	*dstp = d1 | d | 0xff000000; \
	++srcp; \
	++dstp; \
	widthvar--; \
}
		/* blend until dstp is 16-byte aligned */
		ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
		if (width
> 0) {
			int extrawidth = (width % 4);
			vector unsigned char valigner = VEC_ALIGNER(srcp);
			vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
			width -= extrawidth;
			while (width) {
				vector unsigned char voverflow;
				vector unsigned char vd;

				/* s = *srcp (unaligned vec_ld/vec_perm load) */
				voverflow = (vector unsigned char)vec_ld(15, srcp);
				vs = vec_perm(vs, voverflow, valigner);

				/* d = *dstp */
				vd = (vector unsigned char)vec_ld(0, dstp);

				VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

				/* set the alpha channel to full on */
				vd = vec_or(vd, valphamask);

				/* *dstp = res */
				vec_st((vector unsigned int)vd, 0, dstp);

				srcp += 4;
				dstp += 4;
				width -= 4;
				vs = voverflow;
			}
			/* scalar tail for the leftover (width % 4) pixels */
			ONE_PIXEL_BLEND((extrawidth), extrawidth);
		}
#undef ONE_PIXEL_BLEND

		srcp += srcskip;
		dstp += dstskip;
	}
}
#if __MWERKS__
#pragma altivec_model off
#endif
#endif /* SDL_ALTIVEC_BLITTERS */

/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case.
   Averages src and dst per channel: masking with 0x00fefefe drops each
   channel's low bit so the halves can be added without cross-channel
   carries; (s & d & 0x00010101) adds the rounding bit back. */
static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;

	while(height--) {
		DUFFS_LOOP4({
			Uint32 s = *srcp++;
			Uint32 d = *dstp;
			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
				   + (s & d & 0x00010101)) | 0xff000000;
		}, width);
		srcp += srcskip;
		dstp += dstskip;
	}
}

/* fast RGB888->(A)RGB888 blending with surface alpha; dispatches the
   alpha==128 case to the cheaper averaging blitter above. */
static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha;
	if(alpha == 128) {
		BlitRGBtoRGBSurfaceAlpha128(info);
	} else {
		int width = info->d_width;
		int
height = info->d_height;
		Uint32 *srcp = (Uint32 *)info->s_pixels;
		int srcskip = info->s_skip >> 2;
		Uint32 *dstp = (Uint32 *)info->d_pixels;
		int dstskip = info->d_skip >> 2;
		Uint32 s;
		Uint32 d;
		Uint32 s1;
		Uint32 d1;

		while(height--) {
			DUFFS_LOOP_DOUBLE2({
				/* One Pixel Blend: red+blue blended together in
				   the 0x00ff00ff lanes, green in 0x0000ff00 */
				s = *srcp;
				d = *dstp;
				s1 = s & 0xff00ff;
				d1 = d & 0xff00ff;
				d1 = (d1 + ((s1 - d1) * alpha >> 8))
				     & 0xff00ff;
				s &= 0xff00;
				d &= 0xff00;
				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
				*dstp = d1 | d | 0xff000000;
				++srcp;
				++dstp;
			},{
				/* Two Pixels Blend: the two green channels of a
				   pixel pair are packed into one register so the
				   pair costs three multiplies instead of four */
				s = *srcp;
				d = *dstp;
				s1 = s & 0xff00ff;
				d1 = d & 0xff00ff;
				d1 += (s1 - d1) * alpha >> 8;
				d1 &= 0xff00ff;

				s = ((s & 0xff00) >> 8) |
				    ((srcp[1] & 0xff00) << 8);
				d = ((d & 0xff00) >> 8) |
				    ((dstp[1] & 0xff00) << 8);
				d += (s - d) * alpha >> 8;
				d &= 0x00ff00ff;

				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
				++srcp;

				s1 = *srcp;
				d1 = *dstp;
				s1 &= 0xff00ff;
				d1 &= 0xff00ff;
				d1 += (s1 - d1) * alpha >> 8;
				d1 &= 0xff00ff;

				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
				++srcp;
				++dstp;
			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
	}
}

/* fast ARGB888->(A)RGB888 blending with pixel alpha; destination alpha
   is preserved.  Per-pixel alpha is read from the source's top byte. */
static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;

	while(height--) {
		DUFFS_LOOP4({
			Uint32 dalpha;
			Uint32 d;
			Uint32 s1;
			Uint32 d1;
			Uint32 s = *srcp;
			Uint32 alpha = s >> 24;
			/* FIXME: Here we special-case opaque alpha since the
			   compositioning used (>>8 instead of /255) doesn't
handle
			   it correctly. Also special-case alpha=0 for speed?
			   Benchmark this! */
			if(alpha) {
				if(alpha == SDL_ALPHA_OPAQUE) {
					*dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
				} else {
					/*
					 * take out the middle component (green), and process
					 * the other two in parallel. One multiply less.
					 */
					d = *dstp;
					dalpha = d & 0xff000000;
					s1 = s & 0xff00ff;
					d1 = d & 0xff00ff;
					d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
					s &= 0xff00;
					d &= 0xff00;
					d = (d + ((s - d) * alpha >> 8)) & 0xff00;
					*dstp = d1 | d | dalpha;
				}
			}
			++srcp;
			++dstp;
		}, width);
		srcp += srcskip;
		dstp += dstskip;
	}
}

#if GCC_ASMBLIT
/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha.
   GCC inline-asm version.  Setup asm builds persistent masks in mm3/mm4/mm7
   and loads Ashift into mm5; the per-pixel asm relies on those registers
   staying live across iterations, so no MMX register may be touched between
   the asm statements.  Ends with emms to restore the FPU state. */
static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	SDL_PixelFormat* sf = info->src;
	Uint32 amask = sf->Amask;

	__asm__ (
	/* make mm6 all zeros. */
	"pxor %%mm6, %%mm6\n"

	/* Make a mask to preserve the alpha. */
	"movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */
	"punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */
	"pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */
	"movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */
	"pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */

	/* form channel masks */
	"movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */
	"packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */
	"packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */
	"pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */

	/* get alpha channel shift */
	"movd %1, %%mm5\n\t" /* Ashift -> mm5 */

	: /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );

	while(height--) {

		DUFFS_LOOP4({
			Uint32 alpha;

			/* 3DNow! prefetch of the upcoming src/dst cache lines */
			__asm__ (
			"prefetch 64(%0)\n"
			"prefetch 64(%1)\n"
			: : "r" (srcp), "r" (dstp) );

			alpha = *srcp & amask;
			/* FIXME: Here we special-case opaque alpha since the
			   compositioning used (>>8 instead of /255) doesn't handle
			   it correctly. Also special-case alpha=0 for speed?
			   Benchmark this! */
			if(alpha == 0) {
				/* do nothing */
			}
			else if(alpha == amask) {
				/* opaque alpha -- copy RGB, keep dst alpha */
				/* using MMX here to free up regular registers for other things */
				__asm__ (
				"movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
				"movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
				"pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
				"pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
				"por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
				"movd %%mm1, (%1) \n\t" /* mm1 -> dst */

				: : "r" (srcp), "r" (dstp) );
			}

			else {
				__asm__ (
				/* load in the source, and dst. */
				"movd (%0), %%mm0\n" /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
				"movd (%1), %%mm1\n" /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */

				/* Move the src alpha into mm2 */

				/* if supporting pshufw */
				/*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */
				/*"psrlw $8, %%mm2\n" */

				/* else: */
				"movd %2, %%mm2\n"
				"psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */
				"punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */
				"punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */
				"pand %%mm7, %%mm2\n" /* to preserve dest alpha */

				/* move the colors into words. */
				"punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
				"punpcklbw %%mm6, %%mm1\n" /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */

				/* src - dst */
				"psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */

				/* A * (src-dst) */
				"pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */
				"psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */
				"paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */

				"packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */

				"movd %%mm0, (%1)\n" /* result in mm0 */

				: : "r" (srcp), "r" (dstp), "r" (alpha) );

			}
			++srcp;
			++dstp;
		}, width);
		srcp += srcskip;
		dstp += dstskip;
	}

	/* leave MMX state so subsequent FPU code works */
	__asm__ (
	"emms\n"
	: );
}
/* End GCC_ASMBLIT*/

#elif MSVC_ASMBLIT
/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha.
   MSVC intrinsics version of the blitter above; identical math, expressed
   with _mm_* MMX intrinsics and _m_prefetch (3DNow!). */
static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	SDL_PixelFormat* sf = info->src;
	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
	Uint32 amask = sf->Amask;
	Uint32 ashift = sf->Ashift;
	Uint64 multmask;

	__m64 src1, dst1, mm_alpha, mm_zero, dmask;

	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
	/* mask that zeroes the (word-sized) alpha lane of the multiplier */
	multmask = ~(0xFFFFi64 << (ashift * 2));
	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */

	while(height--) {
		DUFFS_LOOP4({
		Uint32 alpha;

		_m_prefetch(srcp + 16);
		_m_prefetch(dstp + 16);

		alpha = *srcp & amask;
		if (alpha == 0) {
			/* do nothing */
		} else if (alpha == amask) {
			/* copy RGB, keep dst alpha */
			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
		} else {
			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */

			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */

			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */

			/* blend */
			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
			dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */

			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
		}
		++srcp;
		++dstp;
		}, width);
		srcp += srcskip;
		dstp += dstskip;
	}
	_mm_empty();
}
/* End MSVC_ASMBLIT */

#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */

/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */

/* blend a single 16 bit pixel at 50%.
   'mask' has the low bit of every channel cleared so the two halves can
   be added without inter-channel carry; (s & d & ~mask) restores the
   rounding bit. */
#define BLEND16_50(d, s, mask) \
	((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))

/* blend two 16 bit pixels at 50% (same trick on a 32-bit pair) */
#define BLEND2x16_50(d, s, mask) \
	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
	 + (s & d & (~(mask | mask << 16))))

/* 16bpp 50% blit shared by the 565 and 555 paths; 'mask' is the
   carry-safe channel mask for the pixel format (0xf7de / 0xfbde). */
static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint16 *srcp = (Uint16 *)info->s_pixels;
	int srcskip = info->s_skip >> 1;
	Uint16 *dstp = (Uint16 *)info->d_pixels;
	int dstskip = info->d_skip >> 1;

	while(height--) {
		if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
			/*
			 * Source and destination not aligned, pipeline it.
			 * This is mostly a win for big blits but no loss for
			 * small ones
			 */
			Uint32 prev_sw;
			int w = width;

			/* handle odd destination */
			if((uintptr_t)dstp & 2) {
				Uint16 d = *dstp, s = *srcp;
				*dstp = BLEND16_50(d, s, mask);
				dstp++;
				srcp++;
				w--;
			}
			srcp++; /* srcp is now 32-bit aligned */

			/* bootstrap pipeline with first halfword */
			prev_sw = ((Uint32 *)srcp)[-1];

			while(w > 1) {
				Uint32 sw, dw, s;
				sw = *(Uint32 *)srcp;
				dw = *(Uint32 *)dstp;
				/* stitch the straddling source pair out of the
				   previous and current 32-bit words */
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
				s = (prev_sw << 16) + (sw >> 16);
#else
				s = (prev_sw >> 16) + (sw << 16);
#endif
				prev_sw = sw;
				*(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
				dstp += 2;
				srcp += 2;
				w -= 2;
			}

			/* final pixel if any */
			if(w) {
				Uint16 d = *dstp, s;
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
				s = (Uint16)prev_sw;
#else
				s = (Uint16)(prev_sw >> 16);
#endif
				*dstp = BLEND16_50(d, s, mask);
				srcp++;
				dstp++;
			}
			/* srcp over-advanced by one halfword above */
			srcp += srcskip - 1;
			dstp += dstskip;
		} else {
			/* source and destination are aligned */
			int w = width;

			/* first odd pixel? */
			if((uintptr_t)srcp & 2) {
				Uint16 d = *dstp, s = *srcp;
				*dstp = BLEND16_50(d, s, mask);
				srcp++;
				dstp++;
				w--;
			}
			/* srcp and dstp are now 32-bit aligned */

			while(w > 1) {
				Uint32 sw = *(Uint32 *)srcp;
				Uint32 dw = *(Uint32 *)dstp;
				*(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
				srcp += 2;
				dstp += 2;
				w -= 2;
			}

			/* last odd pixel? */
			if(w) {
				Uint16 d = *dstp, s = *srcp;
				*dstp = BLEND16_50(d, s, mask);
				srcp++;
				dstp++;
			}
			srcp += srcskip;
			dstp += dstskip;
		}
	}
}

#if GCC_ASMBLIT
/* fast RGB565->RGB565 blending with surface alpha, MMX version.
   alpha==128 is dispatched to the cheap 50% averaging blitter; otherwise
   the alpha is quantized to 5 bits and the persistent MMX registers
   mm0 (alpha), mm4 (green mask) and mm7 (blue mask) are set up before
   the row loop.  DUFFS_LOOP_QUATRO2 handles 1- and 2-pixel leftovers
   with the scalar 0x07e0f81f trick and 4-pixel groups with MMX. */
static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
	if(alpha == 128) {
		Blit16to16SurfaceAlpha128(info, 0xf7de);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		Uint32 s, d;
		Uint64 load;

		alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
		load = alpha;
		alpha >>= 3; /* downscale alpha to 5 bits */

		movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		psllq_i2r(3, mm0);

		/* Setup the 565 color channel masks */
		load = 0x07E007E007E007E0ULL;
		movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
		load = 0x001F001F001F001FULL;
		movq_m2r(load, mm7); /* MASKBLUE
 -> mm7 */
		while(height--) {
			DUFFS_LOOP_QUATRO2(
			{
				/* single leftover pixel, scalar path */
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
			},{
				/* two leftover pixels, scalar path */
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
			},{
				/* 4 pixels at a time with MMX; mm0/mm4/mm7 were
				   preloaded before the loop */
				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */

				/* red -- does not need a mask since the right shift clears
				   the uninteresting bits */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* alpha used is actually 11 bits
				   11 + 5 = 16 bits, so the sign bits are lost */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */

				movq_r2r(mm6, mm1); /* save new reds in dsts */

				/* green -- process the bits in place */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
				   bits are gone and the sign bits present */
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */

				por_r2r(mm6, mm1); /* save new greens in dsts */

				/* blue */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 5 = 16 bits, so the sign bits are lost and
				   the interesting bits will need to be MASKed */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */

				por_r2r(mm6, mm1); /* save new blues in dsts */

				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
		/* leave MMX state so subsequent FPU code works */
		emms();
	}
}

/* fast RGB555->RGB555 blending with surface alpha, MMX version
   (same structure as the 565 blitter above with 555 masks) */
static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
	if(alpha == 128) {
		Blit16to16SurfaceAlpha128(info, 0xfbde);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;
		Uint16 *dstp = (Uint16 *)info->d_pixels;
int dstskip = info->d_skip >> 1; 2025 Uint32 s, d; 2026 Uint64 load; 2027 2028 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */ 2029 load = alpha; 2030 alpha >>= 3; /* downscale alpha to 5 bits */ 2031 2032 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */ 2033 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */ 2034 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */ 2035 /* position alpha to allow for mullo and mulhi on diff channels 2036 to reduce the number of operations */ 2037 psllq_i2r(3, mm0); 2038 2039 /* Setup the 555 color channel masks */ 2040 load = 0x03E003E003E003E0ULL; 2041 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */ 2042 load = 0x001F001F001F001FULL; 2043 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */ 2044 while(height--) { 2045 DUFFS_LOOP_QUATRO2( 2046 { 2047 s = *srcp++; 2048 d = *dstp; 2049 /* 2050 * shift out the middle component (green) to 2051 * the high 16 bits, and process all three RGB 2052 * components at the same time. 2053 */ 2054 s = (s | s << 16) & 0x03e07c1f; 2055 d = (d | d << 16) & 0x03e07c1f; 2056 d += (s - d) * alpha >> 5; 2057 d &= 0x03e07c1f; 2058 *dstp++ = d | d >> 16; 2059 },{ 2060 s = *srcp++; 2061 d = *dstp; 2062 /* 2063 * shift out the middle component (green) to 2064 * the high 16 bits, and process all three RGB 2065 * components at the same time. 2066 */ 2067 s = (s | s << 16) & 0x03e07c1f; 2068 d = (d | d << 16) & 0x03e07c1f; 2069 d += (s - d) * alpha >> 5; 2070 d &= 0x03e07c1f; 2071 *dstp++ = d | d >> 16; 2072 s = *srcp++; 2073 d = *dstp; 2074 /* 2075 * shift out the middle component (green) to 2076 * the high 16 bits, and process all three RGB 2077 * components at the same time. 
2078 */ 2079 s = (s | s << 16) & 0x03e07c1f; 2080 d = (d | d << 16) & 0x03e07c1f; 2081 d += (s - d) * alpha >> 5; 2082 d &= 0x03e07c1f; 2083 *dstp++ = d | d >> 16; 2084 },{ 2085 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */ 2086 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */ 2087 2088 /* red -- process the bits in place */ 2089 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */ 2090 /* by reusing the GREEN mask we free up another mmx 2091 register to accumulate the result */ 2092 2093 movq_r2r(mm2, mm5); /* src -> mm5 */ 2094 movq_r2r(mm3, mm6); /* dst -> mm6 */ 2095 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */ 2096 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */ 2097 2098 /* blend */ 2099 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ 2100 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ 2101 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be 2102 cleared by a MASK below */ 2103 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */ 2104 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ 2105 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */ 2106 2107 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */ 2108 2109 movq_r2r(mm6, mm1); /* save new reds in dsts */ 2110 2111 /* green -- process the bits in place */ 2112 movq_r2r(mm2, mm5); /* src -> mm5 */ 2113 movq_r2r(mm3, mm6); /* dst -> mm6 */ 2114 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */ 2115 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */ 2116 2117 /* blend */ 2118 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ 2119 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ 2120 /* 11 + 10 - 16 = 5 bits, so all the lower uninteresting 2121 bits are gone and the sign bits present */ 2122 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */ 2123 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ 2124 2125 por_r2r(mm6, mm1); /* save new greens in dsts */ 2126 2127 /* blue */ 2128 movq_r2r(mm2, mm5); /* src -> mm5 */ 2129 movq_r2r(mm3, mm6); /* dst -> mm6 */ 2130 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */ 2131 
pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */ 2132 2133 /* blend */ 2134 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ 2135 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ 2136 /* 11 + 5 = 16 bits, so the sign bits are lost and 2137 the interesting bits will need to be MASKed */ 2138 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */ 2139 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ 2140 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */ 2141 2142 por_r2r(mm6, mm1); /* save new blues in dsts */ 2143 2144 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */ 2145 2146 srcp += 4; 2147 dstp += 4; 2148 }, width); 2149 srcp += srcskip; 2150 dstp += dstskip; 2151 } 2152 emms(); 2153 } 2154 } 2155 /* End GCC_ASMBLIT */ 2156 2157 #elif MSVC_ASMBLIT 2158 /* fast RGB565->RGB565 blending with surface alpha */ 2159 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info) 2160 { 2161 unsigned alpha = info->src->alpha; 2162 if(alpha == 128) { 2163 Blit16to16SurfaceAlpha128(info, 0xf7de); 2164 } else { 2165 int width = info->d_width; 2166 int height = info->d_height; 2167 Uint16 *srcp = (Uint16 *)info->s_pixels; 2168 int srcskip = info->s_skip >> 1; 2169 Uint16 *dstp = (Uint16 *)info->d_pixels; 2170 int dstskip = info->d_skip >> 1; 2171 Uint32 s, d; 2172 2173 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha; 2174 2175 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */ 2176 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */ 2177 alpha >>= 3; /* downscale alpha to 5 bits */ 2178 2179 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ 2180 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ 2181 /* position alpha to allow for mullo and mulhi on diff channels 2182 to reduce the number of operations */ 2183 mm_alpha = _mm_slli_si64(mm_alpha, 3); 2184 2185 /* Setup the 565 color channel masks */ 2186 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask 
									      */
		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */

		while(height--) {
			DUFFS_LOOP_QUATRO2(
			{
				/* single-pixel scalar fallback */
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = (Uint16)(d | d >> 16);
			},{
				/* two-pixel scalar fallback */
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = (Uint16)(d | d >> 16);
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = (Uint16)(d | d >> 16);
			},{
				/* four pixels per iteration via MMX intrinsics.
				   NOTE(review): the __m64 loads assume srcp/dstp are
				   8-byte aligned here -- confirm DUFFS_LOOP_QUATRO2
				   guarantees that for its third body */
				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */

				/* red */
				src2 = src1;
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */

				dst2 = dst1;
				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */

				mm_res = dst2; /* RED -> mm_res */

				/* green -- process the bits in place */
				src2 = src1;
				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */

				dst2 = dst1;
				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */

				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */

				/* blue */
				src2 = src1;
				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */

				dst2 = dst1;
				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */

				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */

				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
		_mm_empty(); /* leave MMX state so the FPU is usable again */
	}
}

/* fast RGB555->RGB555 blending with surface alpha */
/* MSVC intrinsics version; identical structure to the 565 variant above but
   with 555 channel masks and a dedicated rmask register. */
static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha;
	if(alpha == 128) {
		/* 50% alpha fast path; 0xfbde masks the low bit of each 555 channel */
		Blit16to16SurfaceAlpha128(info, 0xfbde);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;	/* skips are bytes; >>1 for 16bpp */
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		Uint32 s, d;

		__m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;

		alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
		alpha >>= 3; /* downscale alpha to 5 bits */

		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		mm_alpha = _mm_slli_si64(mm_alpha, 3);

		/* Setup the 555 color channel masks */
		rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
		gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */

		while(height--) {
			DUFFS_LOOP_QUATRO2(
			{
				/* single-pixel scalar fallback */
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = (Uint16)(d | d >> 16);
			},{
				/* two-pixel scalar fallback */
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = (Uint16)(d | d >> 16);
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = (Uint16)(d | d >> 16);
			},{
				/* four pixels per iteration via MMX intrinsics.
				   NOTE(review): the __m64 loads assume srcp/dstp are
				   8-byte aligned here -- confirm DUFFS_LOOP_QUATRO2
				   guarantees that for its third body */
				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */

				/* red -- process the bits in place */
				src2 = src1;
				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */

				dst2 = dst1;
				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */

				mm_res = dst2; /* RED -> mm_res */

				/* green -- process the bits in place */
				src2 = src1;
				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */

				dst2 = dst1;
				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */

				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */

				/* blue */
				src2 = src1; /* src -> src2 */
				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */

				dst2 = dst1; /* dst -> dst2 */
				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */

				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */

				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
		_mm_empty(); /* leave MMX state so the FPU is usable again */
	}
}
#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */

/* fast RGB565->RGB565 blending with surface alpha */
/* Portable scalar version.  Each 16-bit pixel is spread into a 32-bit word
   (green moved to the high half) so red, green and blue can be blended with
   one multiply; the alpha is reduced to 5 bits first. */
static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha;
	if(alpha == 128) {
		/* 50% alpha fast path; 0xf7de masks the low bit of each 565 channel */
		Blit16to16SurfaceAlpha128(info, 0xf7de);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;	/* skips are bytes; >>1 for 16bpp */
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		alpha >>= 3; /* downscale alpha to 5 bits */

		while(height--) {
			DUFFS_LOOP4({
				Uint32 s = *srcp++;
				Uint32 d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = (Uint16)(d | d >> 16);
			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
	}
}

/* fast RGB555->RGB555 blending with surface alpha */
/* Portable scalar version; same spread-to-32-bit trick as the 565 variant
   above, using the 555 composite mask 0x03e07c1f. */
static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha; /* per-surface alpha, still full 8 bits here
	                                      (downscaled to 5 bits further below) */
	if(alpha == 128) {
		/* 50% alpha fast path; 0xfbde masks the low bit of each 555 channel */
		Blit16to16SurfaceAlpha128(info, 0xfbde);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;	/* skips are bytes; >>1 for 16bpp */
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		alpha >>= 3; /* downscale alpha to 5 bits */

		while(height--) {
			DUFFS_LOOP4({
				Uint32 s = *srcp++;
				Uint32 d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = (Uint16)(d | d >> 16);
			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
	}
}

/* fast ARGB8888->RGB565 blending with pixel alpha */
/* Per-pixel alpha: each source pixel carries its own 8-bit alpha in the top
   byte, reduced to 5 bits (s >> 27) for the blend. */
static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;	/* skips are bytes; >>2 for 32bpp src */
	Uint16 *dstp = (Uint16 *)info->d_pixels;
	int dstskip = info->d_skip >> 1;	/* >>1 for 16bpp dst */

	while(height--) {
		DUFFS_LOOP4({
			Uint32 s = *srcp;
			unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
			/* FIXME: Here we special-case opaque alpha since the
			   compositioning used (>>8 instead of /255) doesn't handle
			   it correctly. Also special-case alpha=0 for speed?
			   Benchmark this! */
			if(alpha) {
				if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
					/* fully opaque: straight 888->565 repack */
					*dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
				} else {
					Uint32 d = *dstp;
					/*
					 * convert source and destination to G0RAB65565
					 * and blend all components at the same time
					 */
					s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
						+ (s >> 3 & 0x1f);
					d = (d | d << 16) & 0x07e0f81f;
					d += (s - d) * alpha >> 5;
					d &= 0x07e0f81f;
					*dstp = (Uint16)(d | d >> 16);
				}
			}
			/* alpha == 0: destination pixel left untouched */
			srcp++;
			dstp++;
		}, width);
		srcp += srcskip;
		dstp += dstskip;
	}
}

/* fast ARGB8888->RGB555 blending with pixel alpha */
/* Same scheme as the 565 variant above, with 555 packing. */
static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;	/* skips are bytes; >>2 for 32bpp src */
	Uint16 *dstp = (Uint16 *)info->d_pixels;
	int dstskip = info->d_skip >> 1;	/* >>1 for 16bpp dst */

	while(height--) {
		DUFFS_LOOP4({
			unsigned alpha;
			Uint32 s = *srcp;
			alpha = s >> 27; /* downscale alpha to 5 bits */
			/* FIXME: Here we special-case opaque alpha since the
			   compositioning used (>>8 instead of /255) doesn't handle
			   it correctly. Also special-case alpha=0 for speed?
			   Benchmark this!
			 */
			if(alpha) {
				if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
					/* fully opaque: straight 888->555 repack */
					*dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));
				} else {
					Uint32 d = *dstp;
					/*
					 * convert source and destination to G0RAB65565
					 * and blend all components at the same time
					 */
					s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
						+ (s >> 3 & 0x1f);
					d = (d | d << 16) & 0x03e07c1f;
					d += (s - d) * alpha >> 5;
					d &= 0x03e07c1f;
					*dstp = (Uint16)(d | d >> 16);
				}
			}
			/* alpha == 0: destination pixel left untouched */
			srcp++;
			dstp++;
		}, width);
		srcp += srcskip;
		dstp += dstskip;
	}
}

/* General (slow) N->N blending with per-surface alpha */
/* Works for any src/dst byte depths via the DISEMBLE/ASSEMBLE pixel macros.
   With surface alpha 0 the whole blit is a no-op (outer if). */
static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint8 *src = info->s_pixels;
	int srcskip = info->s_skip;
	Uint8 *dst = info->d_pixels;
	int dstskip = info->d_skip;
	SDL_PixelFormat *srcfmt = info->src;
	SDL_PixelFormat *dstfmt = info->dst;
	int srcbpp = srcfmt->BytesPerPixel;
	int dstbpp = dstfmt->BytesPerPixel;
	unsigned sA = srcfmt->alpha;
	/* destination alpha written out: opaque if dst has an alpha channel, else 0 */
	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;

	if(sA) {
		while ( height-- ) {
		    DUFFS_LOOP4(
		    {
			Uint32 Pixel;
			unsigned sR;
			unsigned sG;
			unsigned sB;
			unsigned dR;
			unsigned dG;
			unsigned dB;
			DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
			DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
			ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
			ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
			src += srcbpp;
			dst += dstbpp;
		    },
		    width);
		    src += srcskip;
		    dst += dstskip;
		}
	}
}

/* General (slow) colorkeyed N->N blending with per-surface alpha */
/* As above, but pixels matching the source colorkey are skipped entirely
   (destination untouched); src/dst pointers still advance every pixel. */
static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint8 *src = info->s_pixels;
	int srcskip = info->s_skip;
	Uint8 *dst = info->d_pixels;
	int dstskip = info->d_skip;
	SDL_PixelFormat *srcfmt = info->src;
	SDL_PixelFormat *dstfmt = info->dst;
	Uint32 ckey = srcfmt->colorkey;
	int srcbpp = srcfmt->BytesPerPixel;
	int dstbpp = dstfmt->BytesPerPixel;
	unsigned sA = srcfmt->alpha;
	/* destination alpha written out: opaque if dst has an alpha channel, else 0 */
	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;

	while ( height-- ) {
	    DUFFS_LOOP4(
	    {
		Uint32 Pixel;
		unsigned sR;
		unsigned sG;
		unsigned sB;
		unsigned dR;
		unsigned dG;
		unsigned dB;
		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
		if(sA && Pixel != ckey) {
		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
		}
		src += srcbpp;
		dst += dstbpp;
	    },
	    width);
	    src += srcskip;
	    dst += dstskip;
	}
}

/* General (slow) N->N blending with pixel alpha */
/* Fallback for per-pixel alpha between arbitrary formats; the source format
   must provide an alpha channel for DISEMBLE_RGBA. */
static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint8 *src = info->s_pixels;
	int srcskip = info->s_skip;
	Uint8 *dst = info->d_pixels;
	int dstskip = info->d_skip;
	SDL_PixelFormat *srcfmt = info->src;
	SDL_PixelFormat *dstfmt = info->dst;

	int srcbpp;
	int dstbpp;

	/* Set up some basic variables */
	srcbpp = srcfmt->BytesPerPixel;
	dstbpp = dstfmt->BytesPerPixel;

	/* FIXME: for 8bpp source alpha, this doesn't get opaque values
	   quite right. for <8bpp source alpha, it gets them very wrong
	   (check all macros!)
	   It is unclear whether there is a good general solution that doesn't
	   need a branch (or a divide).
	 */
	while ( height-- ) {
	    DUFFS_LOOP4(
	    {
		Uint32 Pixel;
		unsigned sR;
		unsigned sG;
		unsigned sB;
		unsigned dR;
		unsigned dG;
		unsigned dB;
		unsigned sA;
		unsigned dA;
		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
		if(sA) {
		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
		}
		/* sA == 0: destination pixel left untouched */
		src += srcbpp;
		dst += dstbpp;
	    },
	    width);
	    src += srcskip;
	    dst += dstskip;
	}
}


/*
 * Pick the alpha-blending blit function for this surface/destination pair.
 *
 * Dispatch order:
 *   - no source Amask + colorkey  -> colorkeyed per-surface alpha blitters
 *   - no source Amask             -> per-surface alpha blitters, specialized
 *                                    by destination depth (555/565/32bpp fast
 *                                    paths, MMX/AltiVec where available)
 *   - source Amask present        -> per-pixel alpha blitters, similarly
 *                                    specialized.
 * Falls back to the generic BlitNtoN* routines when no fast path matches.
 *
 * NOTE(review): the blit_index parameter is not referenced in this function's
 * body -- presumably kept for signature compatibility with the other
 * SDL_Calculate* dispatchers; confirm against SDL_blit.h.
 */
SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
{
    SDL_PixelFormat *sf = surface->format;
    SDL_PixelFormat *df = surface->map->dst->format;

    if(sf->Amask == 0) {
	if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
	    if(df->BytesPerPixel == 1)
		return BlitNto1SurfaceAlphaKey;
	    else
#if SDL_ALTIVEC_BLITTERS
	    if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
		!(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
		return Blit32to32SurfaceAlphaKeyAltivec;
	    else
#endif
		return BlitNtoNSurfaceAlphaKey;
	} else {
	    /* Per-surface alpha blits */
	    switch(df->BytesPerPixel) {
	    case 1:
		return BlitNto1SurfaceAlpha;

	    case 2:
		/* identity mapping lets us pick the exact 555/565 fast path
		   by the destination's green mask */
		if(surface->map->identity) {
		    if(df->Gmask == 0x7e0)
		    {
#if MMX_ASMBLIT
			if(SDL_HasMMX())
				return Blit565to565SurfaceAlphaMMX;
			else
#endif
				return Blit565to565SurfaceAlpha;
		    }
		    else if(df->Gmask == 0x3e0)
		    {
#if MMX_ASMBLIT
			if(SDL_HasMMX())
				return Blit555to555SurfaceAlphaMMX;
			else
#endif
				return Blit555to555SurfaceAlpha;
		    }
		}
		return BlitNtoNSurfaceAlpha;

	    case 4:
		/* matching 32bpp channel layouts allow the RGB-to-RGB fast paths */
		if(sf->Rmask == df->Rmask
		   && sf->Gmask == df->Gmask
		   && sf->Bmask == df->Bmask
		   && sf->BytesPerPixel == 4)
		{
#if MMX_ASMBLIT
			if(sf->Rshift % 8 == 0
			   && sf->Gshift % 8 == 0
			   && sf->Bshift % 8 == 0
			   && SDL_HasMMX())
			    return BlitRGBtoRGBSurfaceAlphaMMX;
#endif
			if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
			{
#if SDL_ALTIVEC_BLITTERS
				if(!(surface->map->dst->flags & SDL_HWSURFACE)
					&& SDL_HasAltiVec())
					return BlitRGBtoRGBSurfaceAlphaAltivec;
#endif
				return BlitRGBtoRGBSurfaceAlpha;
			}
		}
#if SDL_ALTIVEC_BLITTERS
		if((sf->BytesPerPixel == 4) &&
		   !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
			return Blit32to32SurfaceAlphaAltivec;
		else
#endif
			return BlitNtoNSurfaceAlpha;

	    case 3:
	    default:
		return BlitNtoNSurfaceAlpha;
	    }
	}
    } else {
	/* Per-pixel alpha blits */
	switch(df->BytesPerPixel) {
	case 1:
	    return BlitNto1PixelAlpha;

	case 2:
#if SDL_ALTIVEC_BLITTERS
	    if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
	       df->Gmask == 0x7e0 &&
	       df->Bmask == 0x1f && SDL_HasAltiVec())
		return Blit32to565PixelAlphaAltivec;
	    else
#endif
	    /* ARGB8888 (or ABGR8888) source into 555/565 destination */
	    if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
	       && sf->Gmask == 0xff00
	       && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
		   || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
		if(df->Gmask == 0x7e0)
		    return BlitARGBto565PixelAlpha;
		else if(df->Gmask == 0x3e0)
		    return BlitARGBto555PixelAlpha;
	    }
	    return BlitNtoNPixelAlpha;

	case 4:
	    if(sf->Rmask == df->Rmask
	       && sf->Gmask == df->Gmask
	       && sf->Bmask == df->Bmask
	       && sf->BytesPerPixel == 4)
	    {
#if MMX_ASMBLIT
		/* byte-aligned channels with a full 8-bit alpha enable the
		   MMX / 3DNow! per-pixel fast paths */
		if(sf->Rshift % 8 == 0
		   && sf->Gshift % 8 == 0
		   && sf->Bshift % 8 == 0
		   && sf->Ashift % 8 == 0
		   && sf->Aloss == 0)
		{
			if(SDL_Has3DNow())
				return BlitRGBtoRGBPixelAlphaMMX3DNOW;
			if(SDL_HasMMX())
				return BlitRGBtoRGBPixelAlphaMMX;
		}
#endif
		if(sf->Amask == 0xff000000)
		{
#if SDL_ALTIVEC_BLITTERS
			if(!(surface->map->dst->flags & SDL_HWSURFACE)
				&& SDL_HasAltiVec())
				return BlitRGBtoRGBPixelAlphaAltivec;
#endif
			return BlitRGBtoRGBPixelAlpha;
		}
	    }
#if SDL_ALTIVEC_BLITTERS
	    if (sf->Amask && sf->BytesPerPixel == 4 &&
	        !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
		return Blit32to32PixelAlphaAltivec;
	    else
#endif
		return BlitNtoNPixelAlpha;

	case 3:
	default:
	    return BlitNtoNPixelAlpha;
	}
    }
}
