/*
    SDL - Simple DirectMedia Layer
    Copyright (C) 1997-2006 Sam Lantinga

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

    Sam Lantinga
    slouken (at) libsdl.org
*/
#include "SDL_config.h"

#include "SDL_video.h"
#include "SDL_blit.h"

/*
  In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
  Checking if _mm_free is #defined in malloc.h is the only way to
  determine if the Processor Pack is installed, as far as I can tell.
*/

#if SDL_ASSEMBLY_ROUTINES
#  if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
#    define MMX_ASMBLIT 1
#    define GCC_ASMBLIT 1
#  elif defined(_MSC_VER) && defined(_M_IX86)
#    if (_MSC_VER <= 1200)
#      include <malloc.h>
#      if defined(_mm_free)
#        define HAVE_MMINTRIN_H 1
#      endif
#    else  /* Visual Studio > VC6 always has mmintrin.h */
#      define HAVE_MMINTRIN_H 1
#    endif
#    if HAVE_MMINTRIN_H
#      define MMX_ASMBLIT 1
#      define MSVC_ASMBLIT 1
#    endif
#  endif
#endif /* SDL_ASSEMBLY_ROUTINES */

/* Function to check the CPU flags */
#include "SDL_cpuinfo.h"
#if GCC_ASMBLIT
#include "mmx.h"
#elif MSVC_ASMBLIT
#include <mmintrin.h>
#include <mm3dnow.h>
#endif

/* Functions to perform alpha blended blitting */

/* N->1 blending with per-surface alpha.
 * Blends an N-byte-per-pixel RGB source onto an 8-bit paletted
 * destination using the constant per-surface alpha (srcfmt->alpha).
 * The blended result is packed as R3G3B2 (dR>>5, dG>>5, dB>>6) and,
 * when a palette map is supplied, remapped through it to the closest
 * destination palette index.
 * NOTE: DUFFS_LOOP4 / DISEMBLE_RGB / ALPHA_BLEND are macros that are
 * presumably provided by "SDL_blit.h" above — confirm there. */
static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
{
    int width = info->d_width;
    int height = info->d_height;
    Uint8 *src = info->s_pixels;
    int srcskip = info->s_skip;
    Uint8 *dst = info->d_pixels;
    int dstskip = info->d_skip;
    Uint8 *palmap = info->table;
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    int srcbpp = srcfmt->BytesPerPixel;

    /* per-surface alpha, constant for the whole blit */
    const unsigned A = srcfmt->alpha;

    while ( height-- ) {
        DUFFS_LOOP4(
        {
            Uint32 Pixel;
            unsigned sR;
            unsigned sG;
            unsigned sB;
            unsigned dR;
            unsigned dG;
            unsigned dB;
            DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
            /* read back the current dest color through the palette */
            dR = dstfmt->palette->colors[*dst].r;
            dG = dstfmt->palette->colors[*dst].g;
            dB = dstfmt->palette->colors[*dst].b;
            ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
            dR &= 0xff;
            dG &= 0xff;
            dB &= 0xff;
            /* Pack RGB into 8bit pixel */
            if ( palmap == NULL ) {
                *dst =((dR>>5)<<(3+2))|
                      ((dG>>5)<<(2))|
                      ((dB>>6)<<(0));
            } else {
                *dst = palmap[((dR>>5)<<(3+2))|
                              ((dG>>5)<<(2)) |
                              ((dB>>6)<<(0))];
            }
            dst++;
            src += srcbpp;
        },
        width);
        src += srcskip;
        dst += dstskip;
    }
}

/* N->1 blending with pixel alpha.
 * Same as BlitNto1SurfaceAlpha, but the alpha value is taken from
 * each source pixel (DISEMBLE_RGBA) instead of the surface. */
static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
{
    int width = info->d_width;
    int height = info->d_height;
    Uint8 *src = info->s_pixels;
    int srcskip = info->s_skip;
    Uint8 *dst = info->d_pixels;
    int dstskip = info->d_skip;
    Uint8 *palmap = info->table;
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    int srcbpp = srcfmt->BytesPerPixel;

    /* FIXME: fix alpha bit field expansion here too? */
    while ( height-- ) {
        DUFFS_LOOP4(
        {
            Uint32 Pixel;
            unsigned sR;
            unsigned sG;
            unsigned sB;
            unsigned sA;
            unsigned dR;
            unsigned dG;
            unsigned dB;
            DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
            dR = dstfmt->palette->colors[*dst].r;
            dG = dstfmt->palette->colors[*dst].g;
            dB = dstfmt->palette->colors[*dst].b;
            ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
            dR &= 0xff;
            dG &= 0xff;
            dB &= 0xff;
            /* Pack RGB into 8bit pixel */
            if ( palmap == NULL ) {
                *dst =((dR>>5)<<(3+2))|
                      ((dG>>5)<<(2))|
                      ((dB>>6)<<(0));
            } else {
                *dst = palmap[((dR>>5)<<(3+2))|
                              ((dG>>5)<<(2)) |
                              ((dB>>6)<<(0)) ];
            }
            dst++;
            src += srcbpp;
        },
        width);
        src += srcskip;
        dst += dstskip;
    }
}

/* colorkeyed N->1 blending with per-surface alpha.
 * As BlitNto1SurfaceAlpha, but source pixels equal to the colorkey
 * are skipped (the destination pixel is left untouched). */
static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
{
    int width = info->d_width;
    int height = info->d_height;
    Uint8 *src = info->s_pixels;
    int srcskip = info->s_skip;
    Uint8 *dst = info->d_pixels;
    int dstskip = info->d_skip;
    Uint8 *palmap = info->table;
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    int srcbpp = srcfmt->BytesPerPixel;
    Uint32 ckey = srcfmt->colorkey;

    const int A = srcfmt->alpha;

    while ( height-- ) {
        DUFFS_LOOP(
        {
            Uint32 Pixel;
            unsigned sR;
            unsigned sG;
            unsigned sB;
            unsigned dR;
            unsigned dG;
            unsigned dB;
            DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
            if ( Pixel != ckey ) {
                dR = dstfmt->palette->colors[*dst].r;
                dG = dstfmt->palette->colors[*dst].g;
                dB = dstfmt->palette->colors[*dst].b;
                ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
                dR &= 0xff;
                dG &= 0xff;
                dB &= 0xff;
                /* Pack RGB into 8bit pixel */
                if ( palmap == NULL ) {
                    *dst =((dR>>5)<<(3+2))|
                          ((dG>>5)<<(2)) |
                          ((dB>>6)<<(0));
                } else {
                    *dst = palmap[((dR>>5)<<(3+2))|
                                  ((dG>>5)<<(2)) |
                                  ((dB>>6)<<(0)) ];
                }
            }
            dst++;
            src += srcbpp;
        },
        width);
        src += srcskip;
        dst += dstskip;
    }
}

#if GCC_ASMBLIT
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case.
 * alpha==128 allows (src+dst)/2 averaging: mask to 0x00fefefe so the
 * per-channel low bit can't carry across channels, shift right once,
 * then add back the carry bits (s & d & 0x00010101).  The destination
 * alpha channel is forced to the full Amask.
 * NOTE: movq_m2r etc. are presumably the mmx.h pseudo-op macros
 * included above — confirm there. */
static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
{
    int width = info->d_width;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;
    Uint32 dalpha = info->dst->Amask;
    Uint8 load[8];

    *(Uint64 *)load = 0x00fefefe00fefefeULL;/* alpha128 mask */
    movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */
    *(Uint64 *)load = 0x0001010100010101ULL;/* !alpha128 mask */
    movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */
    movd_m2r(dalpha, mm7); /* dst alpha mask */
    punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
    while(height--) {
        DUFFS_LOOP_DOUBLE2(
        {
            /* odd-pixel scalar fallback: same averaging trick in C */
            Uint32 s = *srcp++;
            Uint32 d = *dstp;
            *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
                   + (s & d & 0x00010101)) | dalpha;
        },{
            movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
            movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */

            movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
            movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */

            pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
            pand_r2r(mm4, mm5); /* src & mask -> mm5 */
            paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
            pand_r2r(mm1, mm2); /* src & dst -> mm2 */
            psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
            pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
            paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */

            por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
            movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
            dstp += 2;
            srcp += 2;
        }, width);
        srcp += srcskip;
        dstp += dstskip;
    }
    emms();
}

/* fast RGB888->(A)RGB888 blending with surface alpha.
 * Per-surface alpha blend of 32-bit pixels using MMX:
 * dst += ((src - dst) * alpha) >> 8, per 8-bit channel.
 * Dispatches to the alpha==128 averaging special case when R,G,B
 * occupy the low 24 bits. */
static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
{
    SDL_PixelFormat* df = info->dst;
    unsigned alpha = info->src->alpha;

    if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
        /* only call a128 version when R,G,B occupy lower bits */
        BlitRGBtoRGBSurfaceAlpha128MMX(info);
    } else {
        int width = info->d_width;
        int height = info->d_height;
        Uint32 *srcp = (Uint32 *)info->s_pixels;
        int srcskip = info->s_skip >> 2;
        Uint32 *dstp = (Uint32 *)info->d_pixels;
        int dstskip = info->d_skip >> 2;

        pxor_r2r(mm5, mm5); /* 0 -> mm5 */
        /* form the alpha mult */
        movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
        punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
        punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
        /* zero the alpha word of the multiplier so dst alpha survives
           the final add (only R,G,B channels get blended) */
        alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
        movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
        punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
        pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
        /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
        movd_m2r(df->Amask, mm7); /* dst alpha mask */
        punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */

        while(height--) {
            DUFFS_LOOP_DOUBLE2({
                /* One Pixel Blend */
                movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
                movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
                punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
                punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */

                psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
                pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
                psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
                paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */

                packuswb_r2r(mm5, mm2); /* ARGBARGB -> mm2 */
                por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
                movd_r2m(mm2, *dstp);/* mm2 -> pixel */
                ++srcp;
                ++dstp;
            },{
                /* Two Pixels Blend */
                movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
                movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
                movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
                movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */

                punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
                punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
                punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
                punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */

                psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
                pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
                psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
                paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */

                psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
                pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
                psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
                paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */

                packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */
                por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */

                movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */

                srcp += 2;
                dstp += 2;
            }, width);
            srcp += srcskip;
            dstp += dstskip;
        }
        emms();
    }
}

/* fast ARGB888->(A)RGB888 blending with pixel alpha.
 * Per-pixel alpha blend using MMX; special-cases alpha==0 (skip)
 * and alpha==opaque (straight RGB copy keeping dst alpha). */
static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
{
    int width = info->d_width;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;
    SDL_PixelFormat* sf = info->src;
    Uint32 amask = sf->Amask;

    pxor_r2r(mm6, mm6); /* 0 -> mm6 */
    /* form multiplication mask */
    movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
    punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
    pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
    movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
    pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
    /* form channel masks */
    movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
    packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
    packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
    pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
    /* get alpha channel shift */
    __asm__ __volatile__ (
        "movd %0, %%mm5"
        : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */

    while(height--) {
        DUFFS_LOOP4({
            Uint32 alpha = *srcp & amask;
            /* FIXME: Here we special-case opaque alpha since the
               compositioning used (>>8 instead of /255) doesn't handle
               it correctly. Also special-case alpha=0 for speed?
               Benchmark this! */
            if(alpha == 0) {
                /* do nothing */
            } else if(alpha == amask) {
                /* opaque alpha -- copy RGB, keep dst alpha */
                /* using MMX here to free up regular registers for other things */
                movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
                movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
                pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
                pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
                por_r2r(mm1, mm2); /* src | dst -> mm2 */
                movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
            } else {
                movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
                punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */

                movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
                punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */

                __asm__ __volatile__ (
                    "movd %0, %%mm4"
                    : : "r" (alpha) ); /* 0000A000 -> mm4 */
                psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
                punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
                punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
                pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */

                /* blend */
                psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
                pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
                psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
                paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */

                packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */
                movd_r2m(mm2, *dstp);/* mm2 -> dst */
            }
            ++srcp;
            ++dstp;
        }, width);
        srcp += srcskip;
        dstp += dstskip;
    }
    emms();
}
/* End GCC_ASMBLIT */

#elif MSVC_ASMBLIT
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case.
 * MSVC intrinsics version of the alpha==128 averaging blit: see the
 * GCC version above for the masking trick.  Odd leading pixel is
 * handled in scalar C, then pixels are processed two at a time. */
static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
{
    int width = info->d_width;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;
    Uint32 dalpha = info->dst->Amask;

    __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;

    hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
    lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
    dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */

    while (height--) {
        int n = width;
        if ( n & 1 ) {
            /* odd leading pixel: scalar averaging blend */
            Uint32 s = *srcp++;
            Uint32 d = *dstp;
            *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
                   + (s & d & 0x00010101)) | dalpha;
            n--;
        }

        for (n >>= 1; n > 0; --n) {
            dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
            dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */

            src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
            src2 = src1; /* 2 x src -> src2(ARGBARGB) */

            dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
            src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
            src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
            src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */

            dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
            dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
            dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
            dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */

            *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
            dstp += 2;
            srcp += 2;
        }

        srcp += srcskip;
        dstp += dstskip;
    }
    _mm_empty();
}

/* fast RGB888->(A)RGB888 blending with surface alpha.
 * MSVC intrinsics version: dst += ((src - dst) * alpha) >> 8 per
 * channel, with the alpha word of the multiplier masked off so the
 * destination alpha channel is preserved; dst alpha forced via dsta. */
static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
{
    SDL_PixelFormat* df = info->dst;
    Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
    unsigned alpha = info->src->alpha;

    if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
        /* only call a128 version when R,G,B occupy lower bits */
        BlitRGBtoRGBSurfaceAlpha128MMX(info);
    } else {
        int width = info->d_width;
        int height = info->d_height;
        Uint32 *srcp = (Uint32 *)info->s_pixels;
        int srcskip = info->s_skip >> 2;
        Uint32 *dstp = (Uint32 *)info->d_pixels;
        int dstskip = info->d_skip >> 2;
        Uint32 dalpha = df->Amask;
        Uint32 amult;

        __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;

        mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
        /* form the alpha mult */
        amult = alpha | (alpha << 8);
        amult = amult | (amult << 16);
        chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
        mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
        mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
        /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
        dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */

        while (height--) {
            int n = width;
            if (n & 1) {
                /* One Pixel Blend */
                src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
                src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */

                dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
                dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */

                src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
                dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */

                dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
                dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
                *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */

                ++srcp;
                ++dstp;

                n--;
            }

            for (n >>= 1; n > 0; --n) {
                /* Two Pixels Blend */
                src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
                src2 = src1; /* 2 x src -> src2(ARGBARGB) */
                src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
                src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */

                dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
                dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
                dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
                dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */

                src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
                src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
                src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
                dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */

                src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
                dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */

                dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
                dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */

                *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */

                srcp += 2;
                dstp += 2;
            }
            srcp += srcskip;
            dstp += dstskip;
        }
        _mm_empty();
    }
}

/* fast ARGB888->(A)RGB888 blending with pixel alpha.
 * MSVC intrinsics version; special-cases alpha==0 (skip) and
 * alpha==opaque (scalar RGB copy keeping dst alpha).
 * NOTE: 0xFFFFi64 is an MSVC-specific 64-bit literal suffix. */
static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
{
    int width = info->d_width;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;
    SDL_PixelFormat* sf = info->src;
    Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
    Uint32 amask = sf->Amask;
    Uint32 ashift = sf->Ashift;
    Uint64 multmask;

    __m64 src1, dst1, mm_alpha, mm_zero, dmask;

    mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
    /* clear the 16-bit lane that holds alpha (lane index = ashift/8,
       i.e. bit offset ashift*2 in the 64-bit word of 16-bit lanes) */
    multmask = ~(0xFFFFi64 << (ashift * 2));
    dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */

    while(height--) {
        DUFFS_LOOP4({
            Uint32 alpha = *srcp & amask;
            if (alpha == 0) {
                /* do nothing */
            } else if (alpha == amask) {
                /* opaque alpha -- copy RGB, keep dst alpha */
                *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
            } else {
                src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
                src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */

                dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
                dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */

                mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
                mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
                mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
                mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
                mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */

                /* blend */
                src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
                src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
                src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
                dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
                dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */

                *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
            }
            ++srcp;
            ++dstp;
        }, width);
        srcp += srcskip;
        dstp += dstskip;
    }
    _mm_empty();
}
/* End MSVC_ASMBLIT */

#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */

#if SDL_ALTIVEC_BLITTERS
#if __MWERKS__
#pragma altivec_model on
#endif
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include <assert.h>

/* Older Apple GCC used parenthesized vector literals; everything else
   uses braces. */
#if (defined(__MACOSX__) && (__GNUC__ < 4))
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) ( a,b,c,d,e,f,g,h )
#else
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) { a,b,c,d,e,f,g,h }
#endif

#define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
#define VECPRINT(msg, v) do { \
    vector unsigned int tmpvec = (vector unsigned int)(v); \
    unsigned int *vp = (unsigned int *)&tmpvec; \
    printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
} while (0)

/* the permutation vector that takes the high bytes out of all the appropriate shorts
    (vector unsigned char)(
        0x00, 0x10, 0x02, 0x12,
        0x04, 0x14, 0x06, 0x16,
        0x08, 0x18, 0x0A, 0x1A,
        0x0C, 0x1C, 0x0E, 0x1E );
*/
#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    ? vec_lvsl(0, src) \
    : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))


/* vd = (vs * valpha + vd * (255 - valpha)) / 255, per byte channel,
   using the (x + 1 + ((x + 1) >> 8)) >> 8 approximation of /255. */
#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
    /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
    vector unsigned short vtemp1 = vec_mule(vs, valpha); \
    /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
    vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
    /* valpha2 is 255-alpha */ \
    vector unsigned char valpha2 = vec_nor(valpha, valpha); \
    /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
    vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
    /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
    vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
    /* add source and dest */ \
    vtemp1 = vec_add(vtemp1, vtemp3); \
    vtemp2 = vec_add(vtemp2, vtemp4); \
    /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
    vtemp1 = vec_add(vtemp1, v1_16); \
    vtemp3 = vec_sr(vtemp1, v8_16); \
    vtemp1 = vec_add(vtemp1, vtemp3); \
    /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
    vtemp2 = vec_add(vtemp2, v1_16); \
    vtemp4 = vec_sr(vtemp2, v8_16); \
    vtemp2 = vec_add(vtemp2, vtemp4); \
    /* (>>8) and get ARGBARGBARGBARGB */ \
    vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
} while (0)

/* Calculate the permute vector used for 32->32 swizzling */
static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
                                           const SDL_PixelFormat *dstfmt)
{
    /*
     * We have to assume that the bits that aren't used by other
     * colors is alpha, and it's one complete byte, since some formats
     * leave alpha with a zero mask, but we should still swizzle the bits.
     */
    /* ARGB */
    const static struct SDL_PixelFormat default_pixel_format = {
        NULL, 0, 0,
        0, 0, 0, 0,
        16, 8, 0, 24,
        0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
        0, 0};
    if (!srcfmt) {
        srcfmt = &default_pixel_format;
    }
    if (!dstfmt) {
        dstfmt = &default_pixel_format;
    }
    const vector unsigned char plus = VECUINT8_LITERAL
        ( 0x00, 0x00, 0x00, 0x00,
          0x04, 0x04, 0x04, 0x04,
          0x08, 0x08, 0x08, 0x08,
          0x0C, 0x0C, 0x0C, 0x0C );
    vector unsigned char vswiz;
    vector unsigned int srcvec;
/* map a channel shift (0/8/16/24) to its byte index within a pixel */
#define RESHIFT(X) (3 - ((X) >> 3))
    Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
    Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
    Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
    Uint32 amask;
    /* Use zero for alpha if either surface doesn't have alpha */
    if (dstfmt->Amask) {
        amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
    } else {
        amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
    }
#undef RESHIFT
    ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
    vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
    return(vswiz);
}

/* 32-bit ARGB (any channel order) -> RGB565, with per-pixel alpha,
 * AltiVec: scalar pre/post loops handle unaligned/leftover pixels,
 * the vector loop blends 8 pixels per iteration. */
static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
{
    int height = info->d_height;
    Uint8 *src = (Uint8 *)info->s_pixels;
    int srcskip = info->s_skip;
    Uint8 *dst = (Uint8 *)info->d_pixels;
    int dstskip = info->d_skip;
    SDL_PixelFormat *srcfmt = info->src;

    vector unsigned char v0 = vec_splat_u8(0);
    vector unsigned short v8_16 = vec_splat_u16(8);
    vector unsigned short v1_16 = vec_splat_u16(1);
    vector unsigned short v2_16 = vec_splat_u16(2);
    vector unsigned short v3_16 = vec_splat_u16(3);
    vector unsigned int v8_32 = vec_splat_u32(8);
    vector unsigned int v16_32 = vec_add(v8_32, v8_32);
    vector unsigned short v3f = VECUINT16_LITERAL(
        0x003f, 0x003f, 0x003f, 0x003f,
        0x003f, 0x003f, 0x003f, 0x003f);
    vector unsigned short vfc = VECUINT16_LITERAL(
        0x00fc, 0x00fc, 0x00fc, 0x00fc,
        0x00fc, 0x00fc, 0x00fc, 0x00fc);

    /*
        0x10 - 0x1f is the alpha
        0x00 - 0x0e evens are the red
        0x01 - 0x0f odds are zero
    */
    vector unsigned char vredalpha1 = VECUINT8_LITERAL(
        0x10, 0x00, 0x01, 0x01,
        0x10, 0x02, 0x01, 0x01,
        0x10, 0x04, 0x01, 0x01,
        0x10, 0x06, 0x01, 0x01
    );
    vector unsigned char vredalpha2 = (vector unsigned char)(
        vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
    );
    /*
        0x00 - 0x0f is ARxx ARxx ARxx ARxx
        0x11 - 0x0f odds are blue
    */
    vector unsigned char vblue1 = VECUINT8_LITERAL(
        0x00, 0x01, 0x02, 0x11,
        0x04, 0x05, 0x06, 0x13,
        0x08, 0x09, 0x0a, 0x15,
        0x0c, 0x0d, 0x0e, 0x17
    );
    vector unsigned char vblue2 = (vector unsigned char)(
        vec_add((vector unsigned int)vblue1, v8_32)
    );
    /*
        0x00 - 0x0f is ARxB ARxB ARxB ARxB
        0x10 - 0x0e evens are green
    */
    vector unsigned char vgreen1 = VECUINT8_LITERAL(
        0x00, 0x01, 0x10, 0x03,
        0x04, 0x05, 0x12, 0x07,
        0x08, 0x09, 0x14, 0x0b,
        0x0c, 0x0d, 0x16, 0x0f
    );
    vector unsigned char vgreen2 = (vector unsigned char)(
        vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
    );
    vector unsigned char vgmerge = VECUINT8_LITERAL(
        0x00, 0x02, 0x00, 0x06,
        0x00, 0x0a, 0x00, 0x0e,
        0x00, 0x12, 0x00, 0x16,
        0x00, 0x1a, 0x00, 0x1e);
    vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
    vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
    vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));

    vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
    vf800 = vec_sl(vf800, vec_splat_u16(8));

    while(height--) {
        int extrawidth;
        vector unsigned char valigner;
        vector unsigned char vsrc;
        vector unsigned char voverflow;
        int width = info->d_width;

/* scalar one-pixel blend: unpack dst 565, blend accurately, repack */
#define ONE_PIXEL_BLEND(condition, widthvar) \
        while (condition) { \
            Uint32 Pixel; \
            unsigned sR, sG, sB, dR, dG, dB, sA; \
            DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
            if(sA) { \
                unsigned short dstpixel = *((unsigned short *)dst); \
                dR = (dstpixel >> 8) & 0xf8; \
                dG = (dstpixel >> 3) & 0xfc; \
                dB = (dstpixel << 3) & 0xf8; \
                ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
                *((unsigned short *)dst) = ( \
                    ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
                ); \
            } \
            src += 4; \
            dst += 2; \
            widthvar--; \
        }
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
        extrawidth = (width % 8);
        valigner = VEC_ALIGNER(src);
        vsrc = (vector unsigned char)vec_ld(0, src);
        width -= extrawidth;
        while (width) {
            vector unsigned char valpha;
            vector unsigned char vsrc1, vsrc2;
            vector unsigned char vdst1, vdst2;
            vector unsigned short vR, vG, vB;
            vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;

            /* Load 8 pixels from src as ARGB */
            voverflow = (vector unsigned char)vec_ld(15, src);
            vsrc = vec_perm(vsrc, voverflow, valigner);
            vsrc1 = vec_perm(vsrc, vsrc, vpermute);
            src += 16;
            vsrc = (vector unsigned char)vec_ld(15, src);
            voverflow = vec_perm(voverflow, vsrc, valigner);
            vsrc2 = vec_perm(voverflow, voverflow, vpermute);
            src += 16;

            /* Load 8 pixels from dst as XRGB */
            voverflow = vec_ld(0, dst);
            vR = vec_and((vector unsigned short)voverflow, vf800);
            vB = vec_sl((vector unsigned short)voverflow, v3_16);
            vG = vec_sl(vB, v2_16);
            vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
            vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
            vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
            vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR,
                                                   (vector unsigned char)vR, vredalpha2);
            vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
            vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);

            /* Alpha blend 8 pixels as ARGB */
            valpha = vec_perm(vsrc1, v0, valphaPermute);
            VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
            valpha = vec_perm(vsrc2, v0, valphaPermute);
            VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);

            /* Convert 8 pixels to 565 */
            vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
            vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
            vgpixel = vec_and(vgpixel, vfc);
            vgpixel = vec_sl(vgpixel, v3_16);
            vrpixel = vec_sl(vpixel, v1_16);
            vrpixel = vec_and(vrpixel, vf800);
            vbpixel = vec_and(vpixel, v3f);
            vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
            vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);

            /* Store 8 pixels */
            vec_st(vdst1, 0, dst);

            width -= 8;
            dst += 16;
        }
        ONE_PIXEL_BLEND((extrawidth), extrawidth);
#undef ONE_PIXEL_BLEND
        src += srcskip;
        dst += dstskip;
    }
}

/* colorkeyed 32->32 blending with per-surface alpha, AltiVec.
 * Source pixels whose RGB matches the colorkey leave the destination
 * unchanged (vec_sel with a compare mask); everything else is blended
 * with the constant surface alpha and written with full dst alpha. */
static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
{
    unsigned alpha = info->src->alpha;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    unsigned sA = srcfmt->alpha;
    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
    Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
    Uint32 ckey = info->src->colorkey;
    vector unsigned char mergePermute;
    vector unsigned char vsrcPermute;
    vector unsigned char vdstPermute;
    vector unsigned char vsdstPermute;
    vector unsigned char valpha;
    vector unsigned char valphamask;
    vector unsigned char vbits;
    vector unsigned char v0;
    vector unsigned short v1;
    vector unsigned short v8;
    vector unsigned int vckey;
    vector unsigned int vrgbmask;

    mergePermute = VEC_MERGE_PERMUTE();
    v0 = vec_splat_u8(0);
    v1 = vec_splat_u16(1);
    v8 = vec_splat_u16(8);

    /* set the alpha to 255 on the destination surf */
    valphamask = VEC_ALPHA_MASK();

    vsrcPermute = calc_swizzle32(srcfmt, NULL);
    vdstPermute = calc_swizzle32(NULL, dstfmt);
    vsdstPermute = calc_swizzle32(dstfmt, NULL);

    /* set a vector full of alpha and 255-alpha */
    ((unsigned char *)&valpha)[0] = alpha;
    valpha = vec_splat(valpha, 0);
    vbits = (vector unsigned char)vec_splat_s8(-1);

    ckey &= rgbmask;
    ((unsigned int *)(char*)&vckey)[0] = ckey;
    vckey = vec_splat(vckey, 0);
    ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
    vrgbmask = vec_splat(vrgbmask, 0);

    while(height--) {
        int width = info->d_width;
/* scalar one-pixel blend with colorkey test, for edges/leftovers */
#define ONE_PIXEL_BLEND(condition, widthvar) \
        while (condition) { \
            Uint32 Pixel; \
            unsigned sR, sG, sB, dR, dG, dB; \
            RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
            if(sA && Pixel != ckey) { \
                RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
                DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
                ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
                ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
            } \
            dstp++; \
            srcp++; \
            widthvar--; \
        }
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        if (width > 0) {
            int extrawidth = (width % 4);
            vector
unsigned char valigner = VEC_ALIGNER(srcp); 1008 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); 1009 width -= extrawidth; 1010 while (width) { 1011 vector unsigned char vsel; 1012 vector unsigned char voverflow; 1013 vector unsigned char vd; 1014 vector unsigned char vd_orig; 1015 1016 /* s = *srcp */ 1017 voverflow = (vector unsigned char)vec_ld(15, srcp); 1018 vs = vec_perm(vs, voverflow, valigner); 1019 1020 /* vsel is set for items that match the key */ 1021 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask); 1022 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey); 1023 1024 /* permute to source format */ 1025 vs = vec_perm(vs, valpha, vsrcPermute); 1026 1027 /* d = *dstp */ 1028 vd = (vector unsigned char)vec_ld(0, dstp); 1029 vd_orig = vd = vec_perm(vd, v0, vsdstPermute); 1030 1031 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); 1032 1033 /* set the alpha channel to full on */ 1034 vd = vec_or(vd, valphamask); 1035 1036 /* mask out color key */ 1037 vd = vec_sel(vd, vd_orig, vsel); 1038 1039 /* permute to dest format */ 1040 vd = vec_perm(vd, vbits, vdstPermute); 1041 1042 /* *dstp = res */ 1043 vec_st((vector unsigned int)vd, 0, dstp); 1044 1045 srcp += 4; 1046 dstp += 4; 1047 width -= 4; 1048 vs = voverflow; 1049 } 1050 ONE_PIXEL_BLEND((extrawidth), extrawidth); 1051 } 1052 #undef ONE_PIXEL_BLEND 1053 1054 srcp += srcskip; 1055 dstp += dstskip; 1056 } 1057 } 1058 1059 1060 static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info) 1061 { 1062 int width = info->d_width; 1063 int height = info->d_height; 1064 Uint32 *srcp = (Uint32 *)info->s_pixels; 1065 int srcskip = info->s_skip >> 2; 1066 Uint32 *dstp = (Uint32 *)info->d_pixels; 1067 int dstskip = info->d_skip >> 2; 1068 SDL_PixelFormat *srcfmt = info->src; 1069 SDL_PixelFormat *dstfmt = info->dst; 1070 vector unsigned char mergePermute; 1071 vector unsigned char valphaPermute; 1072 vector unsigned char vsrcPermute; 1073 vector 
unsigned char vdstPermute; 1074 vector unsigned char vsdstPermute; 1075 vector unsigned char valphamask; 1076 vector unsigned char vpixelmask; 1077 vector unsigned char v0; 1078 vector unsigned short v1; 1079 vector unsigned short v8; 1080 1081 v0 = vec_splat_u8(0); 1082 v1 = vec_splat_u16(1); 1083 v8 = vec_splat_u16(8); 1084 mergePermute = VEC_MERGE_PERMUTE(); 1085 valphamask = VEC_ALPHA_MASK(); 1086 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC)); 1087 vpixelmask = vec_nor(valphamask, v0); 1088 vsrcPermute = calc_swizzle32(srcfmt, NULL); 1089 vdstPermute = calc_swizzle32(NULL, dstfmt); 1090 vsdstPermute = calc_swizzle32(dstfmt, NULL); 1091 1092 while ( height-- ) { 1093 width = info->d_width; 1094 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \ 1095 Uint32 Pixel; \ 1096 unsigned sR, sG, sB, dR, dG, dB, sA, dA; \ 1097 DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \ 1098 if(sA) { \ 1099 DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \ 1100 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \ 1101 ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \ 1102 } \ 1103 ++srcp; \ 1104 ++dstp; \ 1105 widthvar--; \ 1106 } 1107 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); 1108 if (width > 0) { 1109 /* vsrcPermute */ 1110 /* vdstPermute */ 1111 int extrawidth = (width % 4); 1112 vector unsigned char valigner = VEC_ALIGNER(srcp); 1113 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); 1114 width -= extrawidth; 1115 while (width) { 1116 vector unsigned char voverflow; 1117 vector unsigned char vd; 1118 vector unsigned char valpha; 1119 vector unsigned char vdstalpha; 1120 /* s = *srcp */ 1121 voverflow = (vector unsigned char)vec_ld(15, srcp); 1122 vs = vec_perm(vs, voverflow, valigner); 1123 vs = vec_perm(vs, v0, vsrcPermute); 1124 1125 valpha = vec_perm(vs, v0, valphaPermute); 1126 1127 /* d = *dstp */ 1128 vd = (vector unsigned char)vec_ld(0, dstp); 1129 vd = 
vec_perm(vd, v0, vsdstPermute); 1130 vdstalpha = vec_and(vd, valphamask); 1131 1132 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); 1133 1134 /* set the alpha to the dest alpha */ 1135 vd = vec_and(vd, vpixelmask); 1136 vd = vec_or(vd, vdstalpha); 1137 vd = vec_perm(vd, v0, vdstPermute); 1138 1139 /* *dstp = res */ 1140 vec_st((vector unsigned int)vd, 0, dstp); 1141 1142 srcp += 4; 1143 dstp += 4; 1144 width -= 4; 1145 vs = voverflow; 1146 1147 } 1148 ONE_PIXEL_BLEND((extrawidth), extrawidth); 1149 } 1150 srcp += srcskip; 1151 dstp += dstskip; 1152 #undef ONE_PIXEL_BLEND 1153 } 1154 } 1155 1156 /* fast ARGB888->(A)RGB888 blending with pixel alpha */ 1157 static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info) 1158 { 1159 int width = info->d_width; 1160 int height = info->d_height; 1161 Uint32 *srcp = (Uint32 *)info->s_pixels; 1162 int srcskip = info->s_skip >> 2; 1163 Uint32 *dstp = (Uint32 *)info->d_pixels; 1164 int dstskip = info->d_skip >> 2; 1165 vector unsigned char mergePermute; 1166 vector unsigned char valphaPermute; 1167 vector unsigned char valphamask; 1168 vector unsigned char vpixelmask; 1169 vector unsigned char v0; 1170 vector unsigned short v1; 1171 vector unsigned short v8; 1172 v0 = vec_splat_u8(0); 1173 v1 = vec_splat_u16(1); 1174 v8 = vec_splat_u16(8); 1175 mergePermute = VEC_MERGE_PERMUTE(); 1176 valphamask = VEC_ALPHA_MASK(); 1177 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC)); 1178 1179 1180 vpixelmask = vec_nor(valphamask, v0); 1181 while(height--) { 1182 width = info->d_width; 1183 #define ONE_PIXEL_BLEND(condition, widthvar) \ 1184 while ((condition)) { \ 1185 Uint32 dalpha; \ 1186 Uint32 d; \ 1187 Uint32 s1; \ 1188 Uint32 d1; \ 1189 Uint32 s = *srcp; \ 1190 Uint32 alpha = s >> 24; \ 1191 if(alpha) { \ 1192 if(alpha == SDL_ALPHA_OPAQUE) { \ 1193 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \ 1194 } else { \ 1195 d = *dstp; \ 1196 dalpha = d & 0xff000000; \ 1197 s1 = s & 0xff00ff; \ 1198 d1 = d & 
0xff00ff; \ 1199 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \ 1200 s &= 0xff00; \ 1201 d &= 0xff00; \ 1202 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \ 1203 *dstp = d1 | d | dalpha; \ 1204 } \ 1205 } \ 1206 ++srcp; \ 1207 ++dstp; \ 1208 widthvar--; \ 1209 } 1210 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); 1211 if (width > 0) { 1212 int extrawidth = (width % 4); 1213 vector unsigned char valigner = VEC_ALIGNER(srcp); 1214 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); 1215 width -= extrawidth; 1216 while (width) { 1217 vector unsigned char voverflow; 1218 vector unsigned char vd; 1219 vector unsigned char valpha; 1220 vector unsigned char vdstalpha; 1221 /* s = *srcp */ 1222 voverflow = (vector unsigned char)vec_ld(15, srcp); 1223 vs = vec_perm(vs, voverflow, valigner); 1224 1225 valpha = vec_perm(vs, v0, valphaPermute); 1226 1227 /* d = *dstp */ 1228 vd = (vector unsigned char)vec_ld(0, dstp); 1229 vdstalpha = vec_and(vd, valphamask); 1230 1231 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); 1232 1233 /* set the alpha to the dest alpha */ 1234 vd = vec_and(vd, vpixelmask); 1235 vd = vec_or(vd, vdstalpha); 1236 1237 /* *dstp = res */ 1238 vec_st((vector unsigned int)vd, 0, dstp); 1239 1240 srcp += 4; 1241 dstp += 4; 1242 width -= 4; 1243 vs = voverflow; 1244 } 1245 ONE_PIXEL_BLEND((extrawidth), extrawidth); 1246 } 1247 srcp += srcskip; 1248 dstp += dstskip; 1249 } 1250 #undef ONE_PIXEL_BLEND 1251 } 1252 1253 static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info) 1254 { 1255 /* XXX : 6 */ 1256 unsigned alpha = info->src->alpha; 1257 int height = info->d_height; 1258 Uint32 *srcp = (Uint32 *)info->s_pixels; 1259 int srcskip = info->s_skip >> 2; 1260 Uint32 *dstp = (Uint32 *)info->d_pixels; 1261 int dstskip = info->d_skip >> 2; 1262 SDL_PixelFormat *srcfmt = info->src; 1263 SDL_PixelFormat *dstfmt = info->dst; 1264 unsigned sA = srcfmt->alpha; 1265 unsigned dA = dstfmt->Amask ? 
SDL_ALPHA_OPAQUE : 0; 1266 vector unsigned char mergePermute; 1267 vector unsigned char vsrcPermute; 1268 vector unsigned char vdstPermute; 1269 vector unsigned char vsdstPermute; 1270 vector unsigned char valpha; 1271 vector unsigned char valphamask; 1272 vector unsigned char vbits; 1273 vector unsigned short v1; 1274 vector unsigned short v8; 1275 1276 mergePermute = VEC_MERGE_PERMUTE(); 1277 v1 = vec_splat_u16(1); 1278 v8 = vec_splat_u16(8); 1279 1280 /* set the alpha to 255 on the destination surf */ 1281 valphamask = VEC_ALPHA_MASK(); 1282 1283 vsrcPermute = calc_swizzle32(srcfmt, NULL); 1284 vdstPermute = calc_swizzle32(NULL, dstfmt); 1285 vsdstPermute = calc_swizzle32(dstfmt, NULL); 1286 1287 /* set a vector full of alpha and 255-alpha */ 1288 ((unsigned char *)&valpha)[0] = alpha; 1289 valpha = vec_splat(valpha, 0); 1290 vbits = (vector unsigned char)vec_splat_s8(-1); 1291 1292 while(height--) { 1293 int width = info->d_width; 1294 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \ 1295 Uint32 Pixel; \ 1296 unsigned sR, sG, sB, dR, dG, dB; \ 1297 DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \ 1298 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \ 1299 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \ 1300 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \ 1301 ++srcp; \ 1302 ++dstp; \ 1303 widthvar--; \ 1304 } 1305 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); 1306 if (width > 0) { 1307 int extrawidth = (width % 4); 1308 vector unsigned char valigner = VEC_ALIGNER(srcp); 1309 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); 1310 width -= extrawidth; 1311 while (width) { 1312 vector unsigned char voverflow; 1313 vector unsigned char vd; 1314 1315 /* s = *srcp */ 1316 voverflow = (vector unsigned char)vec_ld(15, srcp); 1317 vs = vec_perm(vs, voverflow, valigner); 1318 vs = vec_perm(vs, valpha, vsrcPermute); 1319 1320 /* d = *dstp */ 1321 vd = (vector unsigned char)vec_ld(0, 
dstp); 1322 vd = vec_perm(vd, vd, vsdstPermute); 1323 1324 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); 1325 1326 /* set the alpha channel to full on */ 1327 vd = vec_or(vd, valphamask); 1328 vd = vec_perm(vd, vbits, vdstPermute); 1329 1330 /* *dstp = res */ 1331 vec_st((vector unsigned int)vd, 0, dstp); 1332 1333 srcp += 4; 1334 dstp += 4; 1335 width -= 4; 1336 vs = voverflow; 1337 } 1338 ONE_PIXEL_BLEND((extrawidth), extrawidth); 1339 } 1340 #undef ONE_PIXEL_BLEND 1341 1342 srcp += srcskip; 1343 dstp += dstskip; 1344 } 1345 1346 } 1347 1348 1349 /* fast RGB888->(A)RGB888 blending */ 1350 static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info) 1351 { 1352 unsigned alpha = info->src->alpha; 1353 int height = info->d_height; 1354 Uint32 *srcp = (Uint32 *)info->s_pixels; 1355 int srcskip = info->s_skip >> 2; 1356 Uint32 *dstp = (Uint32 *)info->d_pixels; 1357 int dstskip = info->d_skip >> 2; 1358 vector unsigned char mergePermute; 1359 vector unsigned char valpha; 1360 vector unsigned char valphamask; 1361 vector unsigned short v1; 1362 vector unsigned short v8; 1363 1364 mergePermute = VEC_MERGE_PERMUTE(); 1365 v1 = vec_splat_u16(1); 1366 v8 = vec_splat_u16(8); 1367 1368 /* set the alpha to 255 on the destination surf */ 1369 valphamask = VEC_ALPHA_MASK(); 1370 1371 /* set a vector full of alpha and 255-alpha */ 1372 ((unsigned char *)&valpha)[0] = alpha; 1373 valpha = vec_splat(valpha, 0); 1374 1375 while(height--) { 1376 int width = info->d_width; 1377 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \ 1378 Uint32 s = *srcp; \ 1379 Uint32 d = *dstp; \ 1380 Uint32 s1 = s & 0xff00ff; \ 1381 Uint32 d1 = d & 0xff00ff; \ 1382 d1 = (d1 + ((s1 - d1) * alpha >> 8)) \ 1383 & 0xff00ff; \ 1384 s &= 0xff00; \ 1385 d &= 0xff00; \ 1386 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \ 1387 *dstp = d1 | d | 0xff000000; \ 1388 ++srcp; \ 1389 ++dstp; \ 1390 widthvar--; \ 1391 } 1392 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); 1393 if 
(width > 0) { 1394 int extrawidth = (width % 4); 1395 vector unsigned char valigner = VEC_ALIGNER(srcp); 1396 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); 1397 width -= extrawidth; 1398 while (width) { 1399 vector unsigned char voverflow; 1400 vector unsigned char vd; 1401 1402 /* s = *srcp */ 1403 voverflow = (vector unsigned char)vec_ld(15, srcp); 1404 vs = vec_perm(vs, voverflow, valigner); 1405 1406 /* d = *dstp */ 1407 vd = (vector unsigned char)vec_ld(0, dstp); 1408 1409 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); 1410 1411 /* set the alpha channel to full on */ 1412 vd = vec_or(vd, valphamask); 1413 1414 /* *dstp = res */ 1415 vec_st((vector unsigned int)vd, 0, dstp); 1416 1417 srcp += 4; 1418 dstp += 4; 1419 width -= 4; 1420 vs = voverflow; 1421 } 1422 ONE_PIXEL_BLEND((extrawidth), extrawidth); 1423 } 1424 #undef ONE_PIXEL_BLEND 1425 1426 srcp += srcskip; 1427 dstp += dstskip; 1428 } 1429 } 1430 #if __MWERKS__ 1431 #pragma altivec_model off 1432 #endif 1433 #endif /* SDL_ALTIVEC_BLITTERS */ 1434 1435 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ 1436 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info) 1437 { 1438 int width = info->d_width; 1439 int height = info->d_height; 1440 Uint32 *srcp = (Uint32 *)info->s_pixels; 1441 int srcskip = info->s_skip >> 2; 1442 Uint32 *dstp = (Uint32 *)info->d_pixels; 1443 int dstskip = info->d_skip >> 2; 1444 1445 while(height--) { 1446 DUFFS_LOOP4({ 1447 Uint32 s = *srcp++; 1448 Uint32 d = *dstp; 1449 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) 1450 + (s & d & 0x00010101)) | 0xff000000; 1451 }, width); 1452 srcp += srcskip; 1453 dstp += dstskip; 1454 } 1455 } 1456 1457 /* fast RGB888->(A)RGB888 blending with surface alpha */ 1458 static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info) 1459 { 1460 unsigned alpha = info->src->alpha; 1461 if(alpha == 128) { 1462 BlitRGBtoRGBSurfaceAlpha128(info); 1463 } else { 1464 int width = info->d_width; 
int height = info->d_height;
		Uint32 *srcp = (Uint32 *)info->s_pixels;
		int srcskip = info->s_skip >> 2;
		Uint32 *dstp = (Uint32 *)info->d_pixels;
		int dstskip = info->d_skip >> 2;
		Uint32 s;
		Uint32 d;
		Uint32 s1;
		Uint32 d1;

		while(height--) {
			DUFFS_LOOP_DOUBLE2({
				/* One Pixel Blend */
				s = *srcp;
				d = *dstp;
				s1 = s & 0xff00ff;
				d1 = d & 0xff00ff;
				d1 = (d1 + ((s1 - d1) * alpha >> 8))
				     & 0xff00ff;
				s &= 0xff00;
				d &= 0xff00;
				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
				*dstp = d1 | d | 0xff000000;
				++srcp;
				++dstp;
			},{
				/* Two Pixels Blend: the two pixels' green bytes are
				   packed into one word so red/blue and green of both
				   pixels blend with three multiplies total */
				s = *srcp;
				d = *dstp;
				s1 = s & 0xff00ff;
				d1 = d & 0xff00ff;
				d1 += (s1 - d1) * alpha >> 8;
				d1 &= 0xff00ff;

				s = ((s & 0xff00) >> 8) |
				    ((srcp[1] & 0xff00) << 8);
				d = ((d & 0xff00) >> 8) |
				    ((dstp[1] & 0xff00) << 8);
				d += (s - d) * alpha >> 8;
				d &= 0x00ff00ff;

				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
				++srcp;

				s1 = *srcp;
				d1 = *dstp;
				s1 &= 0xff00ff;
				d1 &= 0xff00ff;
				d1 += (s1 - d1) * alpha >> 8;
				d1 &= 0xff00ff;

				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
				++srcp;
				++dstp;
			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
	}
}

/* fast ARGB888->(A)RGB888 blending with pixel alpha */
static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;

	while(height--) {
		DUFFS_LOOP4({
			Uint32 dalpha;
			Uint32 d;
			Uint32 s1;
			Uint32 d1;
			Uint32 s = *srcp;
			Uint32 alpha = s >> 24;
			/* FIXME: Here we special-case opaque alpha since the
			   compositioning used (>>8 instead of /255) doesn't handle
			   it correctly. Also special-case alpha=0 for speed?
			   Benchmark this! */
			if(alpha) {
				if(alpha == SDL_ALPHA_OPAQUE) {
					*dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
				} else {
					/*
					 * take out the middle component (green), and process
					 * the other two in parallel. One multiply less.
					 */
					d = *dstp;
					dalpha = d & 0xff000000;
					s1 = s & 0xff00ff;
					d1 = d & 0xff00ff;
					d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
					s &= 0xff00;
					d &= 0xff00;
					d = (d + ((s - d) * alpha >> 8)) & 0xff00;
					*dstp = d1 | d | dalpha;
				}
			}
			++srcp;
			++dstp;
		}, width);
		srcp += srcskip;
		dstp += dstskip;
	}
}

#if GCC_ASMBLIT
/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	SDL_PixelFormat* sf = info->src;
	Uint32 amask = sf->Amask;

	/* one-time register setup; mm3-mm7 stay live across the whole blit */
	__asm__ (
	/* make mm6 all zeros. */
	"pxor %%mm6, %%mm6\n"

	/* Make a mask to preserve the alpha. */
	"movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */
	"punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */
	"pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */
	"movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */
	"pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */

	/* form channel masks */
	"movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */
	"packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */
	"packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */
	"pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */

	/* get alpha channel shift */
	"movd %1, %%mm5\n\t" /* Ashift -> mm5 */

	: /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );

	while(height--) {

		DUFFS_LOOP4({
			Uint32 alpha;

			/* 3DNow! prefetch of the upcoming cache lines */
			__asm__ (
			"prefetch 64(%0)\n"
			"prefetch 64(%1)\n"
			: : "r" (srcp), "r" (dstp) );

			alpha = *srcp & amask;
			/* FIXME: Here we special-case opaque alpha since the
			   compositioning used (>>8 instead of /255) doesn't handle
			   it correctly. Also special-case alpha=0 for speed?
			   Benchmark this! */
			if(alpha == 0) {
				/* do nothing */
			}
			else if(alpha == amask) {
				/* opaque alpha -- copy RGB, keep dst alpha */
				/* using MMX here to free up regular registers for other things */
				__asm__ (
				"movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
				"movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
				"pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
				"pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
				"por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
				"movd %%mm1, (%1) \n\t" /* mm1 -> dst */

				: : "r" (srcp), "r" (dstp) );
			}

			else {
				__asm__ (
				/* load in the source, and dst. */
				"movd (%0), %%mm0\n" /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
				"movd (%1), %%mm1\n" /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */

				/* Move the src alpha into mm2 */

				/* if supporting pshufw */
				/*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */
				/*"psrlw $8, %%mm2\n" */

				/* else: */
				"movd %2, %%mm2\n"
				"psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */
				"punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */
				"punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */
				"pand %%mm7, %%mm2\n" /* to preserve dest alpha */

				/* move the colors into words. */
				"punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
				"punpcklbw %%mm6, %%mm1\n" /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */

				/* src - dst */
				"psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */

				/* A * (src-dst) */
				"pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */
				"psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */
				"paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */

				"packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */

				"movd %%mm0, (%1)\n" /* result in mm0 */

				: : "r" (srcp), "r" (dstp), "r" (alpha) );

			}
			++srcp;
			++dstp;
		}, width);
		srcp += srcskip;
		dstp += dstskip;
	}

	__asm__ (
	"emms\n"
	: );
}
/* End GCC_ASMBLIT*/

#elif MSVC_ASMBLIT
/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	SDL_PixelFormat* sf = info->src;
	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
	Uint32 amask = sf->Amask;
	Uint32 ashift = sf->Ashift;
	Uint64 multmask;

	__m64 src1, dst1, mm_alpha, mm_zero, dmask;

	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
	multmask = ~(0xFFFFi64 << (ashift * 2)); /* clears the alpha word lane */
	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */

	while(height--) {
		DUFFS_LOOP4({
			Uint32 alpha;

			_m_prefetch(srcp + 16);
			_m_prefetch(dstp + 16);

			alpha = *srcp & amask;
			if (alpha == 0) {
				/* do nothing */
			} else if (alpha == amask) {
				/* copy RGB, keep dst alpha */
				*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
			} else {
				src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
				src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */

				dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */

				mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
				mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
				mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
				mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
				mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */

				/* blend */
				src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
				src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
				src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
				dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
				dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */

				*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
			}
			++srcp;
			++dstp;
		}, width);
		srcp += srcskip;
		dstp += dstskip;
	}
	_mm_empty();
}
/* End MSVC_ASMBLIT */

#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */

/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */

/* blend a single 16 bit pixel at 50% */
#define BLEND16_50(d, s, mask) \
	((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))

/* blend two 16 bit pixels at 50% */
#define BLEND2x16_50(d, s, mask) \
	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
	 + (s & d & (~(mask | mask << 16))))

static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint16 *srcp = (Uint16 *)info->s_pixels;
	int srcskip = info->s_skip >> 1;
	Uint16 *dstp = (Uint16 *)info->d_pixels;
	int dstskip = info->d_skip >> 1;

	while(height--) {
		if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
			/*
			 * Source and destination not aligned, pipeline it.
			 * This is mostly a win for big blits but no loss for
			 * small ones
			 */
			Uint32 prev_sw;
			int w = width;

			/* handle odd destination */
			if((uintptr_t)dstp & 2) {
				Uint16 d = *dstp, s = *srcp;
				*dstp = BLEND16_50(d, s, mask);
				dstp++;
				srcp++;
				w--;
			}
			srcp++;	/* srcp is now 32-bit aligned */

			/* bootstrap pipeline with first halfword */
			prev_sw = ((Uint32 *)srcp)[-1];

			while(w > 1) {
				Uint32 sw, dw, s;
				sw = *(Uint32 *)srcp;
				dw = *(Uint32 *)dstp;
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
				s = (prev_sw << 16) + (sw >> 16);
#else
				s = (prev_sw >> 16) + (sw << 16);
#endif
				prev_sw = sw;
				*(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
				dstp += 2;
				srcp += 2;
				w -= 2;
			}

			/* final pixel if any */
			if(w) {
				Uint16 d = *dstp, s;
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
				s = (Uint16)prev_sw;
#else
				s = (Uint16)(prev_sw >> 16);
#endif
				*dstp = BLEND16_50(d, s, mask);
				srcp++;
				dstp++;
			}
			/* -1 compensates the extra srcp++ done above to align srcp */
			srcp += srcskip - 1;
			dstp += dstskip;
		} else {
			/* source and destination are aligned */
			int w = width;

			/* first odd pixel? */
			if((uintptr_t)srcp & 2) {
				Uint16 d = *dstp, s = *srcp;
				*dstp = BLEND16_50(d, s, mask);
				srcp++;
				dstp++;
				w--;
			}
			/* srcp and dstp are now 32-bit aligned */

			while(w > 1) {
				Uint32 sw = *(Uint32 *)srcp;
				Uint32 dw = *(Uint32 *)dstp;
				*(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
				srcp += 2;
				dstp += 2;
				w -= 2;
			}

			/* last odd pixel? */
			if(w) {
				Uint16 d = *dstp, s = *srcp;
				*dstp = BLEND16_50(d, s, mask);
				srcp++;
				dstp++;
			}
			srcp += srcskip;
			dstp += dstskip;
		}
	}
}

#if GCC_ASMBLIT
/* fast RGB565->RGB565 blending with surface alpha */
static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha;
	if(alpha == 128) {
		Blit16to16SurfaceAlpha128(info, 0xf7de);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		Uint32 s, d;
		Uint8 load[8];

		alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
		*(Uint64 *)load = alpha;
		alpha >>= 3; /* downscale alpha to 5 bits */

		movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		psllq_i2r(3, mm0);

		/* Setup the 565 color channel masks */
		*(Uint64 *)load = 0x07E007E007E007E0ULL;
		movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
		*(Uint64 *)load = 0x001F001F001F001FULL;
		movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
		while(height--) {
			DUFFS_LOOP_QUATRO2(
			{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
			},{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
			},{
				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */

				/* red -- does not need a mask since the right shift clears
				   the uninteresting bits */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* alpha used is actually 11 bits
				   11 + 5 = 16 bits, so the sign bits are lost */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */

				movq_r2r(mm6, mm1); /* save new reds in dsts */

				/* green -- process the bits in place */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
				   bits are gone and the sign bits present */
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */

				por_r2r(mm6, mm1); /* save new greens in dsts */

				/* blue */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 5 = 16 bits, so the sign bits are lost and
				   the interesting bits will need to be MASKed */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */

				por_r2r(mm6, mm1); /* save new blues in dsts */

				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
		emms();
	}
}

/* fast RGB555->RGB555 blending with surface alpha */
static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha;
	if(alpha == 128) {
		Blit16to16SurfaceAlpha128(info, 0xfbde);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		Uint32 s, d;
		Uint8 load[8];

		alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
		*(Uint64 *)load = alpha;
		alpha >>= 3; /* downscale alpha to 5 bits */

		movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		psllq_i2r(3, mm0);

		/* Setup the 555 color channel masks */
		*(Uint64 *)load = 0x03E003E003E003E0ULL;
		movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
		*(Uint64 *)load = 0x001F001F001F001FULL;
		movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
		while(height--) {
			DUFFS_LOOP_QUATRO2(
			{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = d | d >> 16;
			},{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = d | d >> 16;
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = d | d >> 16;
			},{
				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */

				/* red -- process the bits in place */
				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
				/* by reusing the GREEN mask we free up another mmx
				   register to accumulate the result */

				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
				   cleared by a MASK below */
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */

				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */

				movq_r2r(mm6, mm1); /* save new reds in dsts */

				/* green -- process the bits in place */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 10 - 16 = 5 bits, so all the lower uninteresting
				   bits are gone and the sign bits present */
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */

				por_r2r(mm6, mm1); /* save new greens in dsts */

				/* blue */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 5 = 16 bits, so the sign bits are lost and
				   the interesting bits will need to be MASKed */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */

				por_r2r(mm6, mm1); /* save new blues in dsts */

				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
		emms();
	}
}
/* End GCC_ASMBLIT */

#elif MSVC_ASMBLIT
/* fast RGB565->RGB565 blending with surface alpha */
static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha;
	if(alpha == 128) {
		Blit16to16SurfaceAlpha128(info, 0xf7de);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		Uint32 s, d;

		__m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;

		alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
		alpha >>= 3; /* downscale alpha to 5 bits */

		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		mm_alpha = _mm_slli_si64(mm_alpha, 3);

		/* Setup the 565 color channel masks */
		gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask
*/ 2186 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */ 2187 2188 while(height--) { 2189 DUFFS_LOOP_QUATRO2( 2190 { 2191 s = *srcp++; 2192 d = *dstp; 2193 /* 2194 * shift out the middle component (green) to 2195 * the high 16 bits, and process all three RGB 2196 * components at the same time. 2197 */ 2198 s = (s | s << 16) & 0x07e0f81f; 2199 d = (d | d << 16) & 0x07e0f81f; 2200 d += (s - d) * alpha >> 5; 2201 d &= 0x07e0f81f; 2202 *dstp++ = (Uint16)(d | d >> 16); 2203 },{ 2204 s = *srcp++; 2205 d = *dstp; 2206 /* 2207 * shift out the middle component (green) to 2208 * the high 16 bits, and process all three RGB 2209 * components at the same time. 2210 */ 2211 s = (s | s << 16) & 0x07e0f81f; 2212 d = (d | d << 16) & 0x07e0f81f; 2213 d += (s - d) * alpha >> 5; 2214 d &= 0x07e0f81f; 2215 *dstp++ = (Uint16)(d | d >> 16); 2216 s = *srcp++; 2217 d = *dstp; 2218 /* 2219 * shift out the middle component (green) to 2220 * the high 16 bits, and process all three RGB 2221 * components at the same time. 
2222 */ 2223 s = (s | s << 16) & 0x07e0f81f; 2224 d = (d | d << 16) & 0x07e0f81f; 2225 d += (s - d) * alpha >> 5; 2226 d &= 0x07e0f81f; 2227 *dstp++ = (Uint16)(d | d >> 16); 2228 },{ 2229 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */ 2230 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */ 2231 2232 /* red */ 2233 src2 = src1; 2234 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */ 2235 2236 dst2 = dst1; 2237 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */ 2238 2239 /* blend */ 2240 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ 2241 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ 2242 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */ 2243 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ 2244 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */ 2245 2246 mm_res = dst2; /* RED -> mm_res */ 2247 2248 /* green -- process the bits in place */ 2249 src2 = src1; 2250 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */ 2251 2252 dst2 = dst1; 2253 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */ 2254 2255 /* blend */ 2256 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ 2257 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ 2258 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */ 2259 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ 2260 2261 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */ 2262 2263 /* blue */ 2264 src2 = src1; 2265 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */ 2266 2267 dst2 = dst1; 2268 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */ 2269 2270 /* blend */ 2271 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ 2272 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ 2273 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */ 2274 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 
-> dst2 */ 2275 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */ 2276 2277 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */ 2278 2279 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */ 2280 2281 srcp += 4; 2282 dstp += 4; 2283 }, width); 2284 srcp += srcskip; 2285 dstp += dstskip; 2286 } 2287 _mm_empty(); 2288 } 2289 } 2290 2291 /* fast RGB555->RGB555 blending with surface alpha */ 2292 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info) 2293 { 2294 unsigned alpha = info->src->alpha; 2295 if(alpha == 128) { 2296 Blit16to16SurfaceAlpha128(info, 0xfbde); 2297 } else { 2298 int width = info->d_width; 2299 int height = info->d_height; 2300 Uint16 *srcp = (Uint16 *)info->s_pixels; 2301 int srcskip = info->s_skip >> 1; 2302 Uint16 *dstp = (Uint16 *)info->d_pixels; 2303 int dstskip = info->d_skip >> 1; 2304 Uint32 s, d; 2305 2306 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha; 2307 2308 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */ 2309 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */ 2310 alpha >>= 3; /* downscale alpha to 5 bits */ 2311 2312 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ 2313 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ 2314 /* position alpha to allow for mullo and mulhi on diff channels 2315 to reduce the number of operations */ 2316 mm_alpha = _mm_slli_si64(mm_alpha, 3); 2317 2318 /* Setup the 555 color channel masks */ 2319 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */ 2320 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */ 2321 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */ 2322 2323 while(height--) { 2324 DUFFS_LOOP_QUATRO2( 2325 { 2326 s = *srcp++; 2327 d = *dstp; 2328 /* 2329 * shift out the middle component (green) to 2330 * the high 16 bits, and process all three RGB 2331 * components at the same time. 
2332 */ 2333 s = (s | s << 16) & 0x03e07c1f; 2334 d = (d | d << 16) & 0x03e07c1f; 2335 d += (s - d) * alpha >> 5; 2336 d &= 0x03e07c1f; 2337 *dstp++ = (Uint16)(d | d >> 16); 2338 },{ 2339 s = *srcp++; 2340 d = *dstp; 2341 /* 2342 * shift out the middle component (green) to 2343 * the high 16 bits, and process all three RGB 2344 * components at the same time. 2345 */ 2346 s = (s | s << 16) & 0x03e07c1f; 2347 d = (d | d << 16) & 0x03e07c1f; 2348 d += (s - d) * alpha >> 5; 2349 d &= 0x03e07c1f; 2350 *dstp++ = (Uint16)(d | d >> 16); 2351 s = *srcp++; 2352 d = *dstp; 2353 /* 2354 * shift out the middle component (green) to 2355 * the high 16 bits, and process all three RGB 2356 * components at the same time. 2357 */ 2358 s = (s | s << 16) & 0x03e07c1f; 2359 d = (d | d << 16) & 0x03e07c1f; 2360 d += (s - d) * alpha >> 5; 2361 d &= 0x03e07c1f; 2362 *dstp++ = (Uint16)(d | d >> 16); 2363 },{ 2364 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */ 2365 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */ 2366 2367 /* red -- process the bits in place */ 2368 src2 = src1; 2369 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */ 2370 2371 dst2 = dst1; 2372 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */ 2373 2374 /* blend */ 2375 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ 2376 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ 2377 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */ 2378 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ 2379 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */ 2380 2381 mm_res = dst2; /* RED -> mm_res */ 2382 2383 /* green -- process the bits in place */ 2384 src2 = src1; 2385 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */ 2386 2387 dst2 = dst1; 2388 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */ 2389 2390 /* blend */ 2391 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ 2392 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 
*/ 2393 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */ 2394 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ 2395 2396 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */ 2397 2398 /* blue */ 2399 src2 = src1; /* src -> src2 */ 2400 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */ 2401 2402 dst2 = dst1; /* dst -> dst2 */ 2403 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */ 2404 2405 /* blend */ 2406 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ 2407 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ 2408 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */ 2409 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ 2410 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */ 2411 2412 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */ 2413 2414 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */ 2415 2416 srcp += 4; 2417 dstp += 4; 2418 }, width); 2419 srcp += srcskip; 2420 dstp += dstskip; 2421 } 2422 _mm_empty(); 2423 } 2424 } 2425 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */ 2426 2427 /* fast RGB565->RGB565 blending with surface alpha */ 2428 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info) 2429 { 2430 unsigned alpha = info->src->alpha; 2431 if(alpha == 128) { 2432 Blit16to16SurfaceAlpha128(info, 0xf7de); 2433 } else { 2434 int width = info->d_width; 2435 int height = info->d_height; 2436 Uint16 *srcp = (Uint16 *)info->s_pixels; 2437 int srcskip = info->s_skip >> 1; 2438 Uint16 *dstp = (Uint16 *)info->d_pixels; 2439 int dstskip = info->d_skip >> 1; 2440 alpha >>= 3; /* downscale alpha to 5 bits */ 2441 2442 while(height--) { 2443 DUFFS_LOOP4({ 2444 Uint32 s = *srcp++; 2445 Uint32 d = *dstp; 2446 /* 2447 * shift out the middle component (green) to 2448 * the high 16 bits, and process all three RGB 2449 * components at the same time. 
2450 */ 2451 s = (s | s << 16) & 0x07e0f81f; 2452 d = (d | d << 16) & 0x07e0f81f; 2453 d += (s - d) * alpha >> 5; 2454 d &= 0x07e0f81f; 2455 *dstp++ = (Uint16)(d | d >> 16); 2456 }, width); 2457 srcp += srcskip; 2458 dstp += dstskip; 2459 } 2460 } 2461 } 2462 2463 /* fast RGB555->RGB555 blending with surface alpha */ 2464 static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info) 2465 { 2466 unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */ 2467 if(alpha == 128) { 2468 Blit16to16SurfaceAlpha128(info, 0xfbde); 2469 } else { 2470 int width = info->d_width; 2471 int height = info->d_height; 2472 Uint16 *srcp = (Uint16 *)info->s_pixels; 2473 int srcskip = info->s_skip >> 1; 2474 Uint16 *dstp = (Uint16 *)info->d_pixels; 2475 int dstskip = info->d_skip >> 1; 2476 alpha >>= 3; /* downscale alpha to 5 bits */ 2477 2478 while(height--) { 2479 DUFFS_LOOP4({ 2480 Uint32 s = *srcp++; 2481 Uint32 d = *dstp; 2482 /* 2483 * shift out the middle component (green) to 2484 * the high 16 bits, and process all three RGB 2485 * components at the same time. 2486 */ 2487 s = (s | s << 16) & 0x03e07c1f; 2488 d = (d | d << 16) & 0x03e07c1f; 2489 d += (s - d) * alpha >> 5; 2490 d &= 0x03e07c1f; 2491 *dstp++ = (Uint16)(d | d >> 16); 2492 }, width); 2493 srcp += srcskip; 2494 dstp += dstskip; 2495 } 2496 } 2497 } 2498 2499 /* fast ARGB8888->RGB565 blending with pixel alpha */ 2500 static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info) 2501 { 2502 int width = info->d_width; 2503 int height = info->d_height; 2504 Uint32 *srcp = (Uint32 *)info->s_pixels; 2505 int srcskip = info->s_skip >> 2; 2506 Uint16 *dstp = (Uint16 *)info->d_pixels; 2507 int dstskip = info->d_skip >> 1; 2508 2509 while(height--) { 2510 DUFFS_LOOP4({ 2511 Uint32 s = *srcp; 2512 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */ 2513 /* FIXME: Here we special-case opaque alpha since the 2514 compositioning used (>>8 instead of /255) doesn't handle 2515 it correctly. Also special-case alpha=0 for speed? 
2516 Benchmark this! */ 2517 if(alpha) { 2518 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) { 2519 *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f)); 2520 } else { 2521 Uint32 d = *dstp; 2522 /* 2523 * convert source and destination to G0RAB65565 2524 * and blend all components at the same time 2525 */ 2526 s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800) 2527 + (s >> 3 & 0x1f); 2528 d = (d | d << 16) & 0x07e0f81f; 2529 d += (s - d) * alpha >> 5; 2530 d &= 0x07e0f81f; 2531 *dstp = (Uint16)(d | d >> 16); 2532 } 2533 } 2534 srcp++; 2535 dstp++; 2536 }, width); 2537 srcp += srcskip; 2538 dstp += dstskip; 2539 } 2540 } 2541 2542 /* fast ARGB8888->RGB555 blending with pixel alpha */ 2543 static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info) 2544 { 2545 int width = info->d_width; 2546 int height = info->d_height; 2547 Uint32 *srcp = (Uint32 *)info->s_pixels; 2548 int srcskip = info->s_skip >> 2; 2549 Uint16 *dstp = (Uint16 *)info->d_pixels; 2550 int dstskip = info->d_skip >> 1; 2551 2552 while(height--) { 2553 DUFFS_LOOP4({ 2554 unsigned alpha; 2555 Uint32 s = *srcp; 2556 alpha = s >> 27; /* downscale alpha to 5 bits */ 2557 /* FIXME: Here we special-case opaque alpha since the 2558 compositioning used (>>8 instead of /255) doesn't handle 2559 it correctly. Also special-case alpha=0 for speed? 2560 Benchmark this! 
*/ 2561 if(alpha) { 2562 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) { 2563 *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f)); 2564 } else { 2565 Uint32 d = *dstp; 2566 /* 2567 * convert source and destination to G0RAB65565 2568 * and blend all components at the same time 2569 */ 2570 s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00) 2571 + (s >> 3 & 0x1f); 2572 d = (d | d << 16) & 0x03e07c1f; 2573 d += (s - d) * alpha >> 5; 2574 d &= 0x03e07c1f; 2575 *dstp = (Uint16)(d | d >> 16); 2576 } 2577 } 2578 srcp++; 2579 dstp++; 2580 }, width); 2581 srcp += srcskip; 2582 dstp += dstskip; 2583 } 2584 } 2585 2586 /* General (slow) N->N blending with per-surface alpha */ 2587 static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info) 2588 { 2589 int width = info->d_width; 2590 int height = info->d_height; 2591 Uint8 *src = info->s_pixels; 2592 int srcskip = info->s_skip; 2593 Uint8 *dst = info->d_pixels; 2594 int dstskip = info->d_skip; 2595 SDL_PixelFormat *srcfmt = info->src; 2596 SDL_PixelFormat *dstfmt = info->dst; 2597 int srcbpp = srcfmt->BytesPerPixel; 2598 int dstbpp = dstfmt->BytesPerPixel; 2599 unsigned sA = srcfmt->alpha; 2600 unsigned dA = dstfmt->Amask ? 
SDL_ALPHA_OPAQUE : 0; 2601 2602 if(sA) { 2603 while ( height-- ) { 2604 DUFFS_LOOP4( 2605 { 2606 Uint32 Pixel; 2607 unsigned sR; 2608 unsigned sG; 2609 unsigned sB; 2610 unsigned dR; 2611 unsigned dG; 2612 unsigned dB; 2613 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB); 2614 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB); 2615 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); 2616 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); 2617 src += srcbpp; 2618 dst += dstbpp; 2619 }, 2620 width); 2621 src += srcskip; 2622 dst += dstskip; 2623 } 2624 } 2625 } 2626 2627 /* General (slow) colorkeyed N->N blending with per-surface alpha */ 2628 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info) 2629 { 2630 int width = info->d_width; 2631 int height = info->d_height; 2632 Uint8 *src = info->s_pixels; 2633 int srcskip = info->s_skip; 2634 Uint8 *dst = info->d_pixels; 2635 int dstskip = info->d_skip; 2636 SDL_PixelFormat *srcfmt = info->src; 2637 SDL_PixelFormat *dstfmt = info->dst; 2638 Uint32 ckey = srcfmt->colorkey; 2639 int srcbpp = srcfmt->BytesPerPixel; 2640 int dstbpp = dstfmt->BytesPerPixel; 2641 unsigned sA = srcfmt->alpha; 2642 unsigned dA = dstfmt->Amask ? 
SDL_ALPHA_OPAQUE : 0; 2643 2644 while ( height-- ) { 2645 DUFFS_LOOP4( 2646 { 2647 Uint32 Pixel; 2648 unsigned sR; 2649 unsigned sG; 2650 unsigned sB; 2651 unsigned dR; 2652 unsigned dG; 2653 unsigned dB; 2654 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel); 2655 if(sA && Pixel != ckey) { 2656 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); 2657 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB); 2658 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); 2659 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); 2660 } 2661 src += srcbpp; 2662 dst += dstbpp; 2663 }, 2664 width); 2665 src += srcskip; 2666 dst += dstskip; 2667 } 2668 } 2669 2670 /* General (slow) N->N blending with pixel alpha */ 2671 static void BlitNtoNPixelAlpha(SDL_BlitInfo *info) 2672 { 2673 int width = info->d_width; 2674 int height = info->d_height; 2675 Uint8 *src = info->s_pixels; 2676 int srcskip = info->s_skip; 2677 Uint8 *dst = info->d_pixels; 2678 int dstskip = info->d_skip; 2679 SDL_PixelFormat *srcfmt = info->src; 2680 SDL_PixelFormat *dstfmt = info->dst; 2681 2682 int srcbpp; 2683 int dstbpp; 2684 2685 /* Set up some basic variables */ 2686 srcbpp = srcfmt->BytesPerPixel; 2687 dstbpp = dstfmt->BytesPerPixel; 2688 2689 /* FIXME: for 8bpp source alpha, this doesn't get opaque values 2690 quite right. for <8bpp source alpha, it gets them very wrong 2691 (check all macros!) 2692 It is unclear whether there is a good general solution that doesn't 2693 need a branch (or a divide). 
*/ 2694 while ( height-- ) { 2695 DUFFS_LOOP4( 2696 { 2697 Uint32 Pixel; 2698 unsigned sR; 2699 unsigned sG; 2700 unsigned sB; 2701 unsigned dR; 2702 unsigned dG; 2703 unsigned dB; 2704 unsigned sA; 2705 unsigned dA; 2706 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA); 2707 if(sA) { 2708 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA); 2709 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); 2710 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); 2711 } 2712 src += srcbpp; 2713 dst += dstbpp; 2714 }, 2715 width); 2716 src += srcskip; 2717 dst += dstskip; 2718 } 2719 } 2720 2721 2722 SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index) 2723 { 2724 SDL_PixelFormat *sf = surface->format; 2725 SDL_PixelFormat *df = surface->map->dst->format; 2726 2727 if(sf->Amask == 0) { 2728 if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) { 2729 if(df->BytesPerPixel == 1) 2730 return BlitNto1SurfaceAlphaKey; 2731 else 2732 #if SDL_ALTIVEC_BLITTERS 2733 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 && 2734 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) 2735 return Blit32to32SurfaceAlphaKeyAltivec; 2736 else 2737 #endif 2738 return BlitNtoNSurfaceAlphaKey; 2739 } else { 2740 /* Per-surface alpha blits */ 2741 switch(df->BytesPerPixel) { 2742 case 1: 2743 return BlitNto1SurfaceAlpha; 2744 2745 case 2: 2746 if(surface->map->identity) { 2747 if(df->Gmask == 0x7e0) 2748 { 2749 #if MMX_ASMBLIT 2750 if(SDL_HasMMX()) 2751 return Blit565to565SurfaceAlphaMMX; 2752 else 2753 #endif 2754 return Blit565to565SurfaceAlpha; 2755 } 2756 else if(df->Gmask == 0x3e0) 2757 { 2758 #if MMX_ASMBLIT 2759 if(SDL_HasMMX()) 2760 return Blit555to555SurfaceAlphaMMX; 2761 else 2762 #endif 2763 return Blit555to555SurfaceAlpha; 2764 } 2765 } 2766 return BlitNtoNSurfaceAlpha; 2767 2768 case 4: 2769 if(sf->Rmask == df->Rmask 2770 && sf->Gmask == df->Gmask 2771 && sf->Bmask == df->Bmask 2772 && sf->BytesPerPixel == 4) 2773 { 2774 #if MMX_ASMBLIT 2775 
if(sf->Rshift % 8 == 0 2776 && sf->Gshift % 8 == 0 2777 && sf->Bshift % 8 == 0 2778 && SDL_HasMMX()) 2779 return BlitRGBtoRGBSurfaceAlphaMMX; 2780 #endif 2781 if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) 2782 { 2783 #if SDL_ALTIVEC_BLITTERS 2784 if(!(surface->map->dst->flags & SDL_HWSURFACE) 2785 && SDL_HasAltiVec()) 2786 return BlitRGBtoRGBSurfaceAlphaAltivec; 2787 #endif 2788 return BlitRGBtoRGBSurfaceAlpha; 2789 } 2790 } 2791 #if SDL_ALTIVEC_BLITTERS 2792 if((sf->BytesPerPixel == 4) && 2793 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) 2794 return Blit32to32SurfaceAlphaAltivec; 2795 else 2796 #endif 2797 return BlitNtoNSurfaceAlpha; 2798 2799 case 3: 2800 default: 2801 return BlitNtoNSurfaceAlpha; 2802 } 2803 } 2804 } else { 2805 /* Per-pixel alpha blits */ 2806 switch(df->BytesPerPixel) { 2807 case 1: 2808 return BlitNto1PixelAlpha; 2809 2810 case 2: 2811 #if SDL_ALTIVEC_BLITTERS 2812 if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) && 2813 df->Gmask == 0x7e0 && 2814 df->Bmask == 0x1f && SDL_HasAltiVec()) 2815 return Blit32to565PixelAlphaAltivec; 2816 else 2817 #endif 2818 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000 2819 && sf->Gmask == 0xff00 2820 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) 2821 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) { 2822 if(df->Gmask == 0x7e0) 2823 return BlitARGBto565PixelAlpha; 2824 else if(df->Gmask == 0x3e0) 2825 return BlitARGBto555PixelAlpha; 2826 } 2827 return BlitNtoNPixelAlpha; 2828 2829 case 4: 2830 if(sf->Rmask == df->Rmask 2831 && sf->Gmask == df->Gmask 2832 && sf->Bmask == df->Bmask 2833 && sf->BytesPerPixel == 4) 2834 { 2835 #if MMX_ASMBLIT 2836 if(sf->Rshift % 8 == 0 2837 && sf->Gshift % 8 == 0 2838 && sf->Bshift % 8 == 0 2839 && sf->Ashift % 8 == 0 2840 && sf->Aloss == 0) 2841 { 2842 if(SDL_Has3DNow()) 2843 return BlitRGBtoRGBPixelAlphaMMX3DNOW; 2844 if(SDL_HasMMX()) 2845 return BlitRGBtoRGBPixelAlphaMMX; 2846 } 2847 #endif 2848 if(sf->Amask == 0xff000000) 
2849 { 2850 #if SDL_ALTIVEC_BLITTERS 2851 if(!(surface->map->dst->flags & SDL_HWSURFACE) 2852 && SDL_HasAltiVec()) 2853 return BlitRGBtoRGBPixelAlphaAltivec; 2854 #endif 2855 return BlitRGBtoRGBPixelAlpha; 2856 } 2857 } 2858 #if SDL_ALTIVEC_BLITTERS 2859 if (sf->Amask && sf->BytesPerPixel == 4 && 2860 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) 2861 return Blit32to32PixelAlphaAltivec; 2862 else 2863 #endif 2864 return BlitNtoNPixelAlpha; 2865 2866 case 3: 2867 default: 2868 return BlitNtoNPixelAlpha; 2869 } 2870 } 2871 } 2872 2873