/*
 * Copyright 2004, 2005 Red Hat, Inc.
 * Copyright 2004 Nicholas Miell
 * Copyright 2005 Trolltech AS
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Søren Sandmann (sandmann (at) redhat.com)
 * Minor Improvements: Nicholas Miell (nmiell (at) gmail.com)
 * MMX code paths for fbcompose.c by Lars Knoll (lars (at) trolltech.com)
 *
 * Based on work by Owen Taylor
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI

#ifdef USE_LOONGSON_MMI
#include <loongson-mmintrin.h>
#else
#include <mmintrin.h>
#endif
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"

#ifdef VERBOSE
#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
#else
#define CHECKPOINT()
#endif

#if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{

}
#endif

#ifdef USE_X86_MMX
# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
#  include <xmmintrin.h>
# else
/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
 * instructions to be generated that we don't want. Just duplicate the
 * functions we want to use.
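 *
 * The wrappers below provide _mm_movemask_pi8, _mm_mulhi_pu16 and
 * _mm_shuffle_pi16 through inline assembly, so only plain MMX
 * instructions are emitted.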
 */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
    int ret;

    asm ("pmovmskb %1, %0\n\t"
        : "=r" (ret)
        : "y" (__A)
    );

    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
    asm ("pmulhuw %1, %0\n\t"
        : "+y" (__A)
        : "y" (__B)
    );
    return __A;
}

#  ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int8_t const __N)
{
    __m64 ret;

    asm ("pshufw %2, %1, %0\n\t"
        : "=y" (ret)
        : "y" (__A), "K" (__N)
    );

    return ret;
}
#  else
#   define _mm_shuffle_pi16(A, N)                                       \
    ({                                                                  \
        __m64 ret;                                                      \
                                                                        \
        asm ("pshufw %2, %1, %0\n\t"                                    \
             : "=y" (ret)                                               \
             : "y" (A), "K" ((const int8_t)N)                           \
        );                                                              \
                                                                        \
        ret;                                                            \
    })
#  endif
# endif
#endif

#ifndef _MSC_VER
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0)                                    \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
#endif

/* Notes about writing mmx code
 *
 * give memory operands as the second operand. If you give it as the
 * first, gcc will first load it into a register, then use that
 * register
 *
 *   ie. use
 *
 *      _mm_mullo_pi16 (x, mmx_constant);
 *
 *   not
 *
 *      _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies. i.e. when you need a value, try
 * to calculate it from a value that was calculated as early as
 * possible.
 */

/* --------------- MMX primitives ------------------------------------- */

/* If __m64 is defined as a struct or union, then define M64_MEMBER to be
 * the name of the member used to access the data.
 * If __m64 requires using mm_cvt* intrinsics functions to convert between
 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
 * If __m64 and uint64_t values can just be cast to each other directly,
 * then define USE_M64_CASTS.
 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
 */
#ifdef _MSC_VER
# define M64_MEMBER m64_u64
#elif defined(__ICC)
# define USE_CVT_INTRINSICS
#elif defined(USE_LOONGSON_MMI)
# define USE_M64_DOUBLE
#elif defined(__GNUC__)
# define USE_M64_CASTS
#elif defined(__SUNPRO_C)
# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
 * is defined.  If it is used, then the mm_cvt* intrinsics must be used.
 */
#  define USE_CVT_INTRINSICS
# else
/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
 */
#  define M64_MEMBER l_
# endif
#endif

#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
typedef uint64_t mmxdatafield;
#else
typedef __m64 mmxdatafield;
#endif

typedef struct
{
    mmxdatafield mmx_4x00ff;
    mmxdatafield mmx_4x0080;
    mmxdatafield mmx_565_rgb;
    mmxdatafield mmx_565_unpack_multiplier;
    mmxdatafield mmx_565_pack_multiplier;
    mmxdatafield mmx_565_r;
    mmxdatafield mmx_565_g;
    mmxdatafield mmx_565_b;
    mmxdatafield mmx_packed_565_rb;
    mmxdatafield mmx_packed_565_g;
    mmxdatafield mmx_expand_565_g;
    mmxdatafield mmx_expand_565_b;
    mmxdatafield mmx_expand_565_r;
#ifndef USE_LOONGSON_MMI
    mmxdatafield mmx_mask_0;
    mmxdatafield mmx_mask_1;
    mmxdatafield mmx_mask_2;
    mmxdatafield mmx_mask_3;
#endif
    mmxdatafield mmx_full_alpha;
    mmxdatafield mmx_4x0101;
    mmxdatafield mmx_ff000000;
} mmx_data_t;

#if defined(_MSC_VER)
# define MMXDATA_INIT(field, val) { val ## UI64 }
#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
# define MMXDATA_INIT(field, val) field = { val ## ULL }
#else                           /* mmxdatafield is an integral type */
# define MMXDATA_INIT(field, val) field = val ## ULL
#endif

static const mmx_data_t c =
{
    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
    MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004),
    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
    MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
    MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
    MMXDATA_INIT (.mmx_expand_565_g,             0x07e007e007e007e0),
    MMXDATA_INIT (.mmx_expand_565_b,             0x001f001f001f001f),
    MMXDATA_INIT (.mmx_expand_565_r,             0xf800f800f800f800),
#ifndef USE_LOONGSON_MMI
    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
#endif
    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
    MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
    MMXDATA_INIT (.mmx_ff000000,                 0xff000000ff000000),
};

#ifdef USE_CVT_INTRINSICS
#    define MC(x) to_m64 (c.mmx_ ## x)
#elif defined(USE_M64_CASTS)
#    define MC(x) ((__m64)c.mmx_ ## x)
#elif defined(USE_M64_DOUBLE)
#    define MC(x) (*(__m64 *)&c.mmx_ ## x)
#else
#    define MC(x) c.mmx_ ## x
#endif

static force_inline __m64
to_m64 (uint64_t x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtsi64_m64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    __m64 res;

    res.M64_MEMBER = x;
    return res;
#elif defined USE_M64_DOUBLE
    return *(__m64 *)&x;
#else /* USE_M64_CASTS */
    return (__m64)x;
#endif
}

static force_inline uint64_t
to_uint64 (__m64 x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtm64_si64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    uint64_t res = x.M64_MEMBER;
    return res;
#elif defined USE_M64_DOUBLE
    return *(uint64_t *)&x;
#else /* USE_M64_CASTS */
    return (uint64_t)x;
#endif
}

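/* The arithmetic helpers below operate on pixels expanded to 16 bits per
 * channel (00AA00RR00GG00BB).  pix_multiply () approximates a division by
 * 255: it computes t = x * a + 0x80 in each lane and then uses
 * _mm_mulhi_pu16 (t, 0x0101), i.e. (t * 257) >> 16, which is a close
 * approximation of x * a / 255.
 */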
static force_inline __m64
shift (__m64 v,
       int   s)
{
    if (s > 0)
        return _mm_slli_si64 (v, s);
    else if (s < 0)
        return _mm_srli_si64 (v, -s);
    else
        return v;
}

static force_inline __m64
negate (__m64 mask)
{
    return _mm_xor_si64 (mask, MC (4x00ff));
}

static force_inline __m64
pix_multiply (__m64 a, __m64 b)
{
    __m64 res;

    res = _mm_mullo_pi16 (a, b);
    res = _mm_adds_pu16 (res, MC (4x0080));
    res = _mm_mulhi_pu16 (res, MC (4x0101));

    return res;
}

static force_inline __m64
pix_add (__m64 a, __m64 b)
{
    return _mm_adds_pu8 (a, b);
}

static force_inline __m64
expand_alpha (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline __m64
expand_alpha_rev (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
invert_colors (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m64
over (__m64 src,
      __m64 srca,
      __m64 dest)
{
    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
}

static force_inline __m64
over_rev_non_pre (__m64 src, __m64 dest)
{
    __m64 srca = expand_alpha (src);
    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));

    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
}

static force_inline __m64
in (__m64 src, __m64 mask)
{
    return pix_multiply (src, mask);
}

#ifndef _MSC_VER
static force_inline __m64
in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
{
    return over (in (src, mask), pix_multiply (srca, mask), dest);
}

#else

#define in_over(src, srca, mask, dest)                                  \
    over (in (src, mask), pix_multiply (srca, mask), dest)

#endif

/* Elemental unaligned loads */

static force_inline __m64 ldq_u(__m64 *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed. */
    return *(__m64 *)p;
#elif defined USE_ARM_IWMMXT
    int align = (uintptr_t)p & 7;
    __m64 *aligned_p;
    if (align == 0)
        return *p;
    aligned_p = (__m64 *)((uintptr_t)p & ~7);
    return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
#else
    struct __una_u64 { __m64 x __attribute__((packed)); };
    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
    return (__m64) ptr->x;
#endif
}

static force_inline uint32_t ldl_u(const uint32_t *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed.
     */
    return *p;
#else
    struct __una_u32 { uint32_t x __attribute__((packed)); };
    const struct __una_u32 *ptr = (const struct __una_u32 *) p;
    return ptr->x;
#endif
}

static force_inline __m64
load (const uint32_t *v)
{
#ifdef USE_LOONGSON_MMI
    __m64 ret;
    asm ("lwc1 %0, %1\n\t"
        : "=f" (ret)
        : "m" (*v)
    );
    return ret;
#else
    return _mm_cvtsi32_si64 (*v);
#endif
}

static force_inline __m64
load8888 (const uint32_t *v)
{
#ifdef USE_LOONGSON_MMI
    return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
#else
    return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
#endif
}

static force_inline __m64
load8888u (const uint32_t *v)
{
    uint32_t l = ldl_u (v);
    return load8888 (&l);
}

static force_inline __m64
pack8888 (__m64 lo, __m64 hi)
{
    return _mm_packs_pu16 (lo, hi);
}

static force_inline void
store (uint32_t *dest, __m64 v)
{
#ifdef USE_LOONGSON_MMI
    asm ("swc1 %1, %0\n\t"
        : "=m" (*dest)
        : "f" (v)
        : "memory"
    );
#else
    *dest = _mm_cvtsi64_si32 (v);
#endif
}

static force_inline void
store8888 (uint32_t *dest, __m64 v)
{
    v = pack8888 (v, _mm_setzero_si64 ());
    store (dest, v);
}

static force_inline pixman_bool_t
is_equal (__m64 a, __m64 b)
{
#ifdef USE_LOONGSON_MMI
    /* __m64 is double, we can compare directly. */
    return a == b;
#else
    return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
#endif
}

static force_inline pixman_bool_t
is_opaque (__m64 v)
{
#ifdef USE_LOONGSON_MMI
    return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
#else
    __m64 ffs = _mm_cmpeq_pi8 (v, v);
    return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
#endif
}

static force_inline pixman_bool_t
is_zero (__m64 v)
{
    return is_equal (v, _mm_setzero_si64 ());
}

/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 *
 *    00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static force_inline __m64
expand565 (__m64 pixel, int pos)
{
    __m64 p = pixel;
    __m64 t1, t2;

    /* move pixel to low 16 bit and zero the rest */
#ifdef USE_LOONGSON_MMI
    p = loongson_extract_pi16 (p, pos);
#else
    p = shift (shift (p, (3 - pos) * 16), -48);
#endif

    t1 = shift (p, 36 - 11);
    t2 = shift (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, MC (565_rgb));

    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
    return _mm_srli_pi16 (pixel, 8);
}

/* Expand 4 16 bit pixels in an mmx register into two mmx registers of
 *
 *    AARRGGBBAARRGGBB
 */
static force_inline void
expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
{
    __m64 t0, t1, alpha = _mm_setzero_si64 ();
    __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
    __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
    __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
    if (full_alpha)
        alpha = _mm_cmpeq_pi32 (alpha, alpha);

    /* Replicate high bits into empty low bits. */
    r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
    g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
    b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));

    r = _mm_packs_pu16 (r, _mm_setzero_si64 ());    /* 00 00 00 00 R3 R2 R1 R0 */
    g = _mm_packs_pu16 (g, _mm_setzero_si64 ());    /* 00 00 00 00 G3 G2 G1 G0 */
    b = _mm_packs_pu16 (b, _mm_setzero_si64 ());    /* 00 00 00 00 B3 B2 B1 B0 */

    t1 = _mm_unpacklo_pi8 (r, alpha);               /* A3 R3 A2 R2 A1 R1 A0 R0 */
    t0 = _mm_unpacklo_pi8 (b, g);                   /* G3 B3 G2 B2 G1 B1 G0 B0 */

    *vout0 = _mm_unpacklo_pi16 (t0, t1);            /* A1 R1 G1 B1 A0 R0 G0 B0 */
    *vout1 = _mm_unpackhi_pi16 (t0, t1);            /* A3 R3 G3 B3 A2 R2 G2 B2 */
}

static force_inline __m64
expand8888 (__m64 in, int pos)
{
    if (pos == 0)
        return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
    else
        return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
}

static force_inline __m64
expandx888 (__m64 in, int pos)
{
    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
}

static force_inline void
expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
{
    __m64 v0, v1;
    expand_4xpacked565 (vin, &v0, &v1, full_alpha);
    *vout0 = expand8888 (v0, 0);
    *vout1 = expand8888 (v0, 1);
    *vout2 = expand8888 (v1, 0);
    *vout3 = expand8888 (v1, 1);
}

static force_inline __m64
pack_565 (__m64 pixel, __m64 target, int pos)
{
    __m64 p = pixel;
    __m64 t = target;
    __m64 r, g, b;

    r = _mm_and_si64 (p, MC (565_r));
    g = _mm_and_si64 (p, MC (565_g));
    b = _mm_and_si64 (p, MC (565_b));

#ifdef USE_LOONGSON_MMI
    r = shift (r, -(32 - 8));
    g = shift (g, -(16 - 3));
    b = shift (b, -(0 + 3));

    p = _mm_or_si64 (r, g);
    p = _mm_or_si64 (p, b);
    return loongson_insert_pi16 (t, p, pos);
#else
    r = shift (r, -(32 - 8) + pos * 16);
    g = shift (g, -(16 - 3) + pos * 16);
    b = shift (b, -(0 + 3) + pos * 16);

    if (pos == 0)
        t = _mm_and_si64 (t, MC (mask_0));
    else if (pos == 1)
        t = _mm_and_si64 (t, MC (mask_1));
    else if (pos == 2)
        t = _mm_and_si64 (t, MC (mask_2));
    else if (pos == 3)
        t = _mm_and_si64 (t, MC (mask_3));

    p = _mm_or_si64 (r, t);
    p = _mm_or_si64 (g, p);

    return _mm_or_si64 (b, p);
#endif
}

static force_inline __m64
pack_4xpacked565 (__m64 a, __m64 b)
{
    __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
    __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));

    __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
    __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));

    __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
    __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));

    t0 = _mm_or_si64 (t0, g0);
    t1 = _mm_or_si64 (t1, g1);

    t0 = shift(t0, -5);
#ifdef USE_ARM_IWMMXT
    t1 = shift(t1, -5);
    return _mm_packs_pu32 (t0, t1);
#else
    t1 = shift(t1, -5 + 16);
    return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
#endif
}

#ifndef _MSC_VER

static force_inline __m64
pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
{
    return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
}

static force_inline __m64
pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
{
    x = pix_multiply (x, a);
    y = pix_multiply (y, b);

    return pix_add (x, y);
}

#else

/* MSVC only handles a "pass by register" of up to three SSE intrinsics */

#define pack_4x565(v0, v1, v2, v3)                                      \
    pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))

#define pix_add_mul(x, a, y, b)                                         \
    ( x = pix_multiply (x, a),                                          \
      y = pix_multiply (y, b),                                          \
      pix_add (x, y) )

#endif

/* --------------- MMX code patch for fbcompose.c --------------------- */

static force_inline __m64
combine (const uint32_t *src, const uint32_t *mask)
{
    __m64 vsrc = load8888 (src);

    if (mask)
    {
        __m64 m = load8888 (mask);

        m = expand_alpha (m);
        vsrc = pix_multiply (vsrc, m);
    }

    return vsrc;
}

static force_inline __m64
core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
{
    vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());

    if (is_opaque (vsrc))
    {
        return vsrc;
    }
    else if (!is_zero (vsrc))
    {
        return over (vsrc, expand_alpha (vsrc),
                     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
    }

    return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
}

static void
mmx_combine_over_u (pixman_implementation_t *imp,
                    pixman_op_t op,
                    uint32_t *dest,
                    const uint32_t *src,
                    const uint32_t *mask,
                    int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 vsrc = combine (src, mask);

        if (is_opaque (vsrc))
        {
            store8888 (dest, vsrc);
        }
        else if (!is_zero (vsrc))
        {
            __m64 sa = expand_alpha (vsrc);
            store8888 (dest, over (vsrc, sa, load8888 (dest)));
        }

        ++dest;
        ++src;
        if (mask)
            ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_over_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t op,
                            uint32_t *dest,
                            const uint32_t *src,
                            const uint32_t *mask,
                            int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 d, da;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        da = expand_alpha (d);
        store8888 (dest, over (d, da, s));

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_in_u (pixman_implementation_t *imp,
                  pixman_op_t op,
                  uint32_t *dest,
                  const uint32_t *src,
                  const uint32_t *mask,
                  int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a;
        __m64 x = combine (src, mask);

        a = load8888 (dest);
        a = expand_alpha (a);
        x = pix_multiply (x, a);

        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_in_reverse_u (pixman_implementation_t *imp,
                          pixman_op_t op,
                          uint32_t *dest,
                          const uint32_t *src,
                          const uint32_t *mask,
                          int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a = combine (src, mask);
        __m64 x;

        x = load8888 (dest);
        a = expand_alpha (a);
        x = pix_multiply (x, a);
        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_out_u (pixman_implementation_t *imp,
                   pixman_op_t op,
                   uint32_t *dest,
                   const uint32_t *src,
                   const uint32_t *mask,
                   int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a;
        __m64 x = combine (src, mask);

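        /* src OUT dest: scale the source by the inverse of the destination alpha. */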
        a = load8888 (dest);
        a = expand_alpha (a);
        a = negate (a);
        x = pix_multiply (x, a);
        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_out_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t op,
                           uint32_t *dest,
                           const uint32_t *src,
                           const uint32_t *mask,
                           int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a = combine (src, mask);
        __m64 x;

        x = load8888 (dest);
        a = expand_alpha (a);
        a = negate (a);
        x = pix_multiply (x, a);

        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_u (pixman_implementation_t *imp,
                    pixman_op_t op,
                    uint32_t *dest,
                    const uint32_t *src,
                    const uint32_t *mask,
                    int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 da, d, sia;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        sia = expand_alpha (s);
        sia = negate (sia);
        da = expand_alpha (d);
        s = pix_add_mul (s, da, d, sia);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t op,
                            uint32_t *dest,
                            const uint32_t *src,
                            const uint32_t *mask,
                            int width)
{
    const uint32_t *end;

    end = dest + width;

    while (dest < end)
    {
        __m64 dia, d, sa;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        sa = expand_alpha (s);
        dia = expand_alpha (d);
        dia = negate (dia);
        s = pix_add_mul (s, dia, d, sa);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_xor_u (pixman_implementation_t *imp,
                   pixman_op_t op,
                   uint32_t *dest,
                   const uint32_t *src,
                   const uint32_t *mask,
                   int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 dia, d, sia;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        sia = expand_alpha (s);
        dia = expand_alpha (d);
        sia = negate (sia);
        dia = negate (dia);
        s = pix_add_mul (s, dia, d, sia);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_add_u (pixman_implementation_t *imp,
                   pixman_op_t op,
                   uint32_t *dest,
                   const uint32_t *src,
                   const uint32_t *mask,
                   int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 d;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        s = pix_add (s, d);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_saturate_u (pixman_implementation_t *imp,
                        pixman_op_t op,
                        uint32_t *dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        uint32_t s, sa, da;
        uint32_t d = *dest;
        __m64 ms = combine (src, mask);
        __m64 md = load8888 (dest);

        store8888 (&s, ms);
        da = ~d >> 24;
        sa = s >> 24;

        if (sa > da)
        {
            uint32_t quot = DIV_UN8 (da, sa) << 24;
            __m64 msa = load8888 (&quot);
            msa = expand_alpha (msa);
            ms = pix_multiply (ms, msa);
        }

        md = pix_add (md, ms);
        store8888 (dest, md);

        ++src;
        ++dest;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_src_ca (pixman_implementation_t *imp,
                    pixman_op_t op,
                    uint32_t *dest,
                    const uint32_t *src,
                    const uint32_t *mask,
                    int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);

        s = pix_multiply (s, a);
        store8888 (dest, s);

        ++src;
        ++mask;
        ++dest;
    }
    _mm_empty ();
}

static void
mmx_combine_over_ca (pixman_implementation_t *imp,
                     pixman_op_t op,
                     uint32_t *dest,
                     const uint32_t *src,
                     const uint32_t *mask,
                     int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        store8888 (dest, in_over (s, sa, a, d));

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t op,
                             uint32_t *dest,
                             const uint32_t *src,
                             const uint32_t *mask,
                             int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        store8888 (dest, over (d, da, in (s, a)));

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_in_ca (pixman_implementation_t *imp,
                   pixman_op_t op,
                   uint32_t *dest,
                   const uint32_t *src,
                   const uint32_t *mask,
                   int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        s = pix_multiply (s, a);
        s = pix_multiply (s, da);
        store8888 (dest, s);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
                           pixman_op_t op,
                           uint32_t *dest,
                           const uint32_t *src,
                           const uint32_t *mask,
                           int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        a = pix_multiply (a, sa);
        d = pix_multiply (d, a);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_out_ca (pixman_implementation_t *imp,
                    pixman_op_t op,
                    uint32_t *dest,
                    const uint32_t *src,
                    const uint32_t *mask,
                    int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        da = negate (da);
        s = pix_multiply (s, a);
        s = pix_multiply (s, da);
        store8888 (dest, s);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t op,
                            uint32_t *dest,
                            const uint32_t *src,
                            const uint32_t *mask,
                            int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        a = pix_multiply (a, sa);
        a = negate (a);
        d = pix_multiply (d, a);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_ca (pixman_implementation_t *imp,
                     pixman_op_t op,
                     uint32_t *dest,
                     const uint32_t *src,
                     const uint32_t *mask,
                     int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        a = negate (a);
        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t op,
                             uint32_t *dest,
                             const uint32_t *src,
                             const uint32_t *mask,
                             int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        da = negate (da);
        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_xor_ca (pixman_implementation_t *imp,
                    pixman_op_t op,
                    uint32_t *dest,
                    const uint32_t *src,
                    const uint32_t *mask,
                    int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        da = negate (da);
        a = negate (a);
        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_add_ca (pixman_implementation_t *imp,
                    pixman_op_t op,
                    uint32_t *dest,
                    const uint32_t *src,
                    const uint32_t *mask,
                    int width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);

        s = pix_multiply (s, a);
        d = pix_add (s, d);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

/* ------------- MMX code paths called from fbpict.c -------------------- */

static void
mmx_composite_over_n_8888 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            store8888 (dst, over (vsrc, vsrca, load8888 (dst)));

            w--;
            dst++;
        }

        while (w >= 2)
        {
            __m64 vdest;
            __m64 dest0, dest1;

            vdest = *(__m64 *)dst;

            dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
            dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));

            *(__m64 *)dst = pack8888 (dest0, dest1);

            dst += 2;
            w -= 2;
        }

        CHECKPOINT ();

        if (w)
        {
            store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_n_0565 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
            *dst = to_uint64 (vdest);

            w--;
            dst++;
        }

        while (w >= 4)
        {
            __m64 vdest = *(__m64 *)dst;
            __m64 v0, v1, v2, v3;

            expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

            v0 = over (vsrc, vsrca, v0);
            v1 = over (vsrc, vsrca, v1);
            v2 = over (vsrc, vsrca, v2);
            v3 = over (vsrc, vsrca, v3);

            *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);

            dst += 4;
            w -= 4;
        }

        CHECKPOINT ();

        while (w)
        {
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
            *dst = to_uint64 (vdest);

            w--;
            dst++;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line;
    uint32_t *mask_line;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        int twidth = width;
        uint32_t *p = (uint32_t *)mask_line;
        uint32_t *q = (uint32_t *)dst_line;

        while (twidth && (uintptr_t)q & 7)
        {
            uint32_t m = *(uint32_t *)p;

            if (m)
            {
                __m64 vdest = load8888 (q);
                vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
                store8888 (q, vdest);
            }

            twidth--;
            p++;
            q++;
        }

        while (twidth >= 2)
        {
            uint32_t m0, m1;
            m0 = *p;
            m1 = *(p + 1);

            if (m0 | m1)
            {
                __m64 dest0, dest1;
                __m64 vdest = *(__m64 *)q;

                dest0 = in_over (vsrc, vsrca, load8888 (&m0),
                                 expand8888 (vdest, 0));
                dest1 = in_over (vsrc, vsrca, load8888 (&m1),
                                 expand8888 (vdest, 1));

                *(__m64 *)q = pack8888 (dest0, dest1);
            }

            p += 2;
            q += 2;
            twidth -= 2;
        }

        if (twidth)
        {
            uint32_t m = *(uint32_t *)p;

            if (m)
            {
                __m64 vdest = load8888 (q);
                vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
                store8888 (q, vdest);
            }

            twidth--;
            p++;
            q++;
        }

        dst_line += dst_stride;
        mask_line += mask_stride;
    }

    _mm_empty ();
}

static void
mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
    vmask = expand_alpha (load8888 (&mask));

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, expand_alpha (s), vmask, d));

            w--;
            dst++;
            src++;
        }

        while (w >= 2)
        {
            __m64 vs = ldq_u ((__m64 *)src);
            __m64 vd = *(__m64 *)dst;
            __m64 vsrc0 = expand8888 (vs, 0);
            __m64 vsrc1 = expand8888 (vs, 1);

            *(__m64 *)dst = pack8888 (
                in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
                in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));

            w -= 2;
            dst += 2;
            src += 2;
        }

        if (w)
        {
            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;
    __m64 srca;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);

    vmask = expand_alpha (load8888 (&mask));
    srca = MC (4x00ff);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
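            /* x888 sources carry no alpha; force the alpha byte to fully opaque. */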
            uint32_t ssrc = *src | 0xff000000;
            __m64 s = load8888 (&ssrc);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, srca, vmask, d));

            w--;
            dst++;
            src++;
        }

        while (w >= 16)
        {
            __m64 vd0 = *(__m64 *)(dst + 0);
            __m64 vd1 = *(__m64 *)(dst + 2);
            __m64 vd2 = *(__m64 *)(dst + 4);
            __m64 vd3 = *(__m64 *)(dst + 6);
            __m64 vd4 = *(__m64 *)(dst + 8);
            __m64 vd5 = *(__m64 *)(dst + 10);
            __m64 vd6 = *(__m64 *)(dst + 12);
            __m64 vd7 = *(__m64 *)(dst + 14);

            __m64 vs0 = ldq_u ((__m64 *)(src + 0));
            __m64 vs1 = ldq_u ((__m64 *)(src + 2));
            __m64 vs2 = ldq_u ((__m64 *)(src + 4));
            __m64 vs3 = ldq_u ((__m64 *)(src + 6));
            __m64 vs4 = ldq_u ((__m64 *)(src + 8));
            __m64 vs5 = ldq_u ((__m64 *)(src + 10));
            __m64 vs6 = ldq_u ((__m64 *)(src + 12));
            __m64 vs7 = ldq_u ((__m64 *)(src + 14));

            vd0 = pack8888 (
                in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
                in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));

            vd1 = pack8888 (
                in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
                in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));

            vd2 = pack8888 (
                in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
                in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));

            vd3 = pack8888 (
                in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
                in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));

            vd4 = pack8888 (
                in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
                in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));

            vd5 = pack8888 (
                in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
                in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));

            vd6 = pack8888 (
                in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
                in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));

            vd7 = pack8888 (
                in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
                in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));

            *(__m64 *)(dst + 0) = vd0;
            *(__m64 *)(dst + 2) = vd1;
            *(__m64 *)(dst + 4) = vd2;
            *(__m64 *)(dst + 6) = vd3;
            *(__m64 *)(dst + 8) = vd4;
            *(__m64 *)(dst + 10) = vd5;
            *(__m64 *)(dst + 12) = vd6;
            *(__m64 *)(dst + 14) = vd7;

            w -= 16;
            dst += 16;
            src += 16;
        }

        while (w)
        {
            uint32_t ssrc = *src | 0xff000000;
            __m64 s = load8888 (&ssrc);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, srca, vmask, d));

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t s;
    int dst_stride, src_stride;
    uint8_t a;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w--)
        {
            s = *src++;
            a = s >> 24;

            if (a == 0xff)
            {
                *dst = s;
            }
            else if (s)
            {
                __m64 ms, sa;
                ms = load8888 (&s);
                sa = expand_alpha (ms);
                store8888 (dst, over (ms, sa, load8888 (dst)));
            }

            dst++;
        }
    }
    _mm_empty ();
}

static void
mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            __m64 vsrc = load8888 (src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (
                over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            w--;
            dst++;
            src++;
        }

        CHECKPOINT ();

        while (w >= 4)
        {
            __m64 vdest = *(__m64 *)dst;
            __m64 v0, v1, v2, v3;
            __m64 vsrc0, vsrc1, vsrc2, vsrc3;

            expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

            vsrc0 = load8888 ((src + 0));
            vsrc1 = load8888 ((src + 1));
            vsrc2 = load8888 ((src + 2));
            vsrc3 = load8888 ((src + 3));

            v0 = over (vsrc0, expand_alpha (vsrc0), v0);
            v1 = over (vsrc1, expand_alpha (vsrc1), v1);
            v2 = over (vsrc2, expand_alpha (vsrc2), v2);
            v3 = over (vsrc3, expand_alpha (vsrc3), v3);

            *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);

            w -= 4;
            dst += 4;
            src += 4;
        }

        CHECKPOINT ();

        while (w)
        {
            __m64 vsrc = load8888 (src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca;
    uint64_t srcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
        return;

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = in_over (vsrc, vsrca,
                                       expand_alpha_rev (to_m64 (m)),
                                       load8888 (dst));

                store8888 (dst, vdest);
            }

            w--;
            mask++;
            dst++;
        }

        CHECKPOINT ();

        while (w >= 2)
        {
            uint64_t m0, m1;

            m0 = *mask;
            m1 = *(mask + 1);

            if (srca == 0xff && (m0 & m1) == 0xff)
            {
                *(uint64_t *)dst = srcsrc;
            }
            else if (m0 | m1)
            {
                __m64 vdest;
                __m64 dest0, dest1;

                vdest = *(__m64 *)dst;

                dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
                                 expand8888 (vdest, 0));
                dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
                                 expand8888 (vdest, 1));

                *(__m64 *)dst = pack8888 (dest0, dest1);
            }

            mask += 2;
            dst += 2;
            w -= 2;
        }

        CHECKPOINT ();

        if (w)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = load8888 (dst);

                vdest = in_over (
                    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
                store8888 (dst, vdest);
            }
        }
    }

    _mm_empty ();
}

static pixman_bool_t
mmx_fill (pixman_implementation_t *imp,
          uint32_t *bits,
          int stride,
          int bpp,
          int x,
          int y,
          int width,
          int height,
          uint32_t filler)
{
    uint64_t fill;
    __m64 vfill;
    uint32_t byte_width;
    uint8_t *byte_line;

#if defined __GNUC__ && defined USE_X86_MMX
    __m64 v1, v2, v3, v4, v5, v6, v7;
#endif

    if (bpp != 16 && bpp != 32 && bpp != 8)
        return FALSE;

    if (bpp == 8)
    {
        stride = stride * (int) sizeof (uint32_t) / 1;
        byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
        byte_width = width;
        stride *= 1;
        filler = (filler & 0xff) * 0x01010101;
    }
    else if (bpp == 16)
    {
        stride = stride * (int) sizeof (uint32_t) / 2;
        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
        byte_width = 2 * width;
        stride *= 2;
        filler = (filler & 0xffff) * 0x00010001;
    }
    else
    {
        stride = stride * (int) sizeof (uint32_t) / 4;
        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
        byte_width = 4 * width;
        stride *= 4;
    }

    fill = ((uint64_t)filler << 32) | filler;
    vfill = to_m64 (fill);

#if defined __GNUC__ && defined USE_X86_MMX
    __asm__ (
        "movq %7, %0\n"
        "movq %7, %1\n"
        "movq %7, %2\n"
        "movq %7, %3\n"
        "movq %7, %4\n"
        "movq %7, %5\n"
        "movq %7, %6\n"
        : "=&y" (v1), "=&y" (v2), "=&y" (v3),
          "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
        : "y" (vfill));
#endif

    while (height--)
    {
        int w;
        uint8_t *d = byte_line;

        byte_line += stride;
        w = byte_width;

        if (w >= 1 && ((uintptr_t)d & 1))
        {
            *(uint8_t *)d = (filler & 0xff);
            w--;
            d++;
        }

        if (w >= 2 && ((uintptr_t)d & 3))
        {
            *(uint16_t *)d = filler;
            w -= 2;
            d += 2;
        }

        while (w >= 4 && ((uintptr_t)d & 7))
        {
            *(uint32_t *)d = filler;

            w -= 4;
            d += 4;
        }

        while (w >= 64)
        {
#if defined __GNUC__ && defined USE_X86_MMX
            __asm__ (
                "movq %1, (%0)\n"
                "movq %2, 8(%0)\n"
                "movq %3, 16(%0)\n"
                "movq %4, 24(%0)\n"
                "movq %5, 32(%0)\n"
                "movq %6, 40(%0)\n"
                "movq %7, 48(%0)\n"
                "movq %8, 56(%0)\n"
                :
                : "r" (d),
                  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
                  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
                : "memory");
#else
            *(__m64*) (d + 0) = vfill;
            *(__m64*) (d + 8) = vfill;
            *(__m64*) (d + 16) = vfill;
            *(__m64*) (d + 24) = vfill;
            *(__m64*) (d + 32) = vfill;
            *(__m64*) (d + 40) = vfill;
            *(__m64*) (d + 48) = vfill;
            *(__m64*) (d + 56) = vfill;
#endif
            w -= 64;
            d += 64;
        }

        while (w >= 4)
        {
            *(uint32_t *)d = filler;

            w -= 4;
            d += 4;
        }
        if (w >= 2)
        {
            *(uint16_t *)d = filler;
            w -= 2;
            d += 2;
        }
        if (w >= 1)
        {
            *(uint8_t *)d = (filler & 0xff);
            w--;
            d++;
        }

    }

    _mm_empty ();
    return TRUE;
}

static void
mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            s = *src++;
            *dst = convert_8888_to_0565 (s);
            dst++;
            w--;
        }

        while (w >= 4)
        {
            __m64 vdest;
            __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
            __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));

            vdest = pack_4xpacked565 (vsrc0, vsrc1);

            *(__m64 *)dst = vdest;

            w -= 4;
            src += 4;
            dst += 4;
        }

        while (w)
        {
            s = *src++;
            *dst = convert_8888_to_0565 (s);
            dst++;
            w--;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
                            pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc;
    uint64_t srcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
    {
        mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
                  PIXMAN_FORMAT_BPP (dest_image->bits.format),
                  dest_x, dest_y, width, height, 0);
        return;
    }

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));

                store8888 (dst, vdest);
            }
            else
            {
                *dst = 0;
            }

            w--;
            mask++;
            dst++;
        }

        CHECKPOINT ();

        while (w >= 2)
        {
            uint64_t m0, m1;
            m0 = *mask;
            m1 = *(mask + 1);

            if (srca == 0xff && (m0 & m1) == 0xff)
            {
                *(uint64_t *)dst = srcsrc;
            }
            else if (m0 | m1)
            {
                __m64 dest0, dest1;

                dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
                dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));

                *(__m64 *)dst = pack8888 (dest0, dest1);
            }
            else
            {
                *(uint64_t *)dst = 0;
            }

            mask += 2;
            dst += 2;
            w -= 2;
        }

        CHECKPOINT ();

        if (w)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = load8888 (dst);

                vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
                store8888 (dst, vdest);
            }
            else
            {
                *dst = 0;
            }
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint16_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca, tmp;
    __m64 srcsrcsrcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
    srcsrcsrcsrc = expand_alpha_rev (tmp);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            uint64_t m = *mask;

            if (m)
            {
                uint64_t d = *dst;
                __m64 vd = to_m64 (d);
                __m64 vdest = in_over (
                    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));

                vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
                *dst = to_uint64 (vd);
            }

            w--;
            mask++;
            dst++;
        }

        CHECKPOINT ();

        while (w >= 4)
        {
            uint64_t m0, m1, m2, m3;
            m0 = *mask;
            m1 = *(mask + 1);
            m2 = *(mask + 2);
            m3 = *(mask + 3);

            if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
            {
                *(__m64 *)dst = srcsrcsrcsrc;
            }
            else if (m0 | m1 | m2 | m3)
            {
                __m64 vdest = *(__m64 *)dst;
                __m64 v0, v1, v2, v3;
                __m64 vm0, vm1, vm2, vm3;

                expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

                vm0 = to_m64 (m0);
                v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);

                vm1 = to_m64 (m1);
                v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);

                vm2 = to_m64 (m2);
                v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);

                vm3 = to_m64 (m3);
                v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);

                *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
            }

            w -= 4;
            mask += 4;
            dst += 4;
        }

        CHECKPOINT ();

        while (w)
        {
            uint64_t m = *mask;

            if (m)
            {
                uint64_t d = *dst;
                __m64 vd = to_m64 (d);
                __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
                                       expand565 (vd, 0));
                vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
                *dst = to_uint64 (vd);
            }

            w--;
            mask++;
            dst++;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
(info); 2512 uint16_t *dst_line, *dst; 2513 uint32_t *src_line, *src; 2514 int dst_stride, src_stride; 2515 int32_t w; 2516 2517 CHECKPOINT (); 2518 2519 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 2520 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 2521 2522 #if 0 2523 /* FIXME */ 2524 assert (src_image->drawable == mask_image->drawable); 2525 #endif 2526 2527 while (height--) 2528 { 2529 dst = dst_line; 2530 dst_line += dst_stride; 2531 src = src_line; 2532 src_line += src_stride; 2533 w = width; 2534 2535 CHECKPOINT (); 2536 2537 while (w && (uintptr_t)dst & 7) 2538 { 2539 __m64 vsrc = load8888 (src); 2540 uint64_t d = *dst; 2541 __m64 vdest = expand565 (to_m64 (d), 0); 2542 2543 vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0); 2544 2545 *dst = to_uint64 (vdest); 2546 2547 w--; 2548 dst++; 2549 src++; 2550 } 2551 2552 CHECKPOINT (); 2553 2554 while (w >= 4) 2555 { 2556 uint32_t s0, s1, s2, s3; 2557 unsigned char a0, a1, a2, a3; 2558 2559 s0 = *src; 2560 s1 = *(src + 1); 2561 s2 = *(src + 2); 2562 s3 = *(src + 3); 2563 2564 a0 = (s0 >> 24); 2565 a1 = (s1 >> 24); 2566 a2 = (s2 >> 24); 2567 a3 = (s3 >> 24); 2568 2569 if ((a0 & a1 & a2 & a3) == 0xFF) 2570 { 2571 __m64 v0 = invert_colors (load8888 (&s0)); 2572 __m64 v1 = invert_colors (load8888 (&s1)); 2573 __m64 v2 = invert_colors (load8888 (&s2)); 2574 __m64 v3 = invert_colors (load8888 (&s3)); 2575 2576 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); 2577 } 2578 else if (s0 | s1 | s2 | s3) 2579 { 2580 __m64 vdest = *(__m64 *)dst; 2581 __m64 v0, v1, v2, v3; 2582 2583 __m64 vsrc0 = load8888 (&s0); 2584 __m64 vsrc1 = load8888 (&s1); 2585 __m64 vsrc2 = load8888 (&s2); 2586 __m64 vsrc3 = load8888 (&s3); 2587 2588 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); 2589 2590 v0 = over_rev_non_pre (vsrc0, v0); 2591 v1 = over_rev_non_pre (vsrc1, v1); 2592 v2 = over_rev_non_pre (vsrc2, v2); 2593 v3 = over_rev_non_pre (vsrc3, v3); 2594 2595 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); 2596 } 2597 2598 w -= 4; 2599 dst += 4; 2600 src += 4; 2601 } 2602 2603 CHECKPOINT (); 2604 2605 while (w) 2606 { 2607 __m64 vsrc = load8888 (src); 2608 uint64_t d = *dst; 2609 __m64 vdest = expand565 (to_m64 (d), 0); 2610 2611 vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0); 2612 2613 *dst = to_uint64 (vdest); 2614 2615 w--; 2616 dst++; 2617 src++; 2618 } 2619 } 2620 2621 _mm_empty (); 2622 } 2623 2624 static void 2625 mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, 2626 pixman_composite_info_t *info) 2627 { 2628 PIXMAN_COMPOSITE_ARGS (info); 2629 uint32_t *dst_line, *dst; 2630 uint32_t *src_line, *src; 2631 int dst_stride, src_stride; 2632 int32_t w; 2633 2634 CHECKPOINT (); 2635 2636 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 2637 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 2638 2639 #if 0 2640 /* FIXME */ 2641 assert (src_image->drawable == mask_image->drawable); 2642 #endif 2643 2644 while (height--) 2645 { 2646 dst = dst_line; 2647 dst_line += dst_stride; 2648 src = src_line; 2649 src_line += src_stride; 2650 w = width; 2651 2652 while (w && (uintptr_t)dst & 7) 2653 { 2654 __m64 s = load8888 (src); 2655 __m64 d = load8888 (dst); 2656 2657 store8888 (dst, over_rev_non_pre (s, d)); 2658 2659 w--; 2660 dst++; 2661 src++; 2662 } 2663 2664 while (w >= 2) 2665 { 2666 uint32_t s0, s1; 2667 unsigned char a0, a1; 2668 __m64 d0, d1; 2669 2670 s0 = *src; 2671 s1 = *(src + 1); 2672 
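/* The alpha bytes of both source pixels are checked up front: if both
 * are fully opaque (0xFF), the OVER blend of this non-premultiplied
 * source reduces to a channel swap (invert_colors) and the destination
 * need not be read; if both pixels are zero, the destination is left
 * unchanged. */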
2673 a0 = (s0 >> 24); 2674 a1 = (s1 >> 24); 2675 2676 if ((a0 & a1) == 0xFF) 2677 { 2678 d0 = invert_colors (load8888 (&s0)); 2679 d1 = invert_colors (load8888 (&s1)); 2680 2681 *(__m64 *)dst = pack8888 (d0, d1); 2682 } 2683 else if (s0 | s1) 2684 { 2685 __m64 vdest = *(__m64 *)dst; 2686 2687 d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0)); 2688 d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1)); 2689 2690 *(__m64 *)dst = pack8888 (d0, d1); 2691 } 2692 2693 w -= 2; 2694 dst += 2; 2695 src += 2; 2696 } 2697 2698 if (w) 2699 { 2700 __m64 s = load8888 (src); 2701 __m64 d = load8888 (dst); 2702 2703 store8888 (dst, over_rev_non_pre (s, d)); 2704 } 2705 } 2706 2707 _mm_empty (); 2708 } 2709 2710 static void 2711 mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, 2712 pixman_composite_info_t *info) 2713 { 2714 PIXMAN_COMPOSITE_ARGS (info); 2715 uint32_t src; 2716 uint16_t *dst_line; 2717 uint32_t *mask_line; 2718 int dst_stride, mask_stride; 2719 __m64 vsrc, vsrca; 2720 2721 CHECKPOINT (); 2722 2723 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 2724 2725 if (src == 0) 2726 return; 2727 2728 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 2729 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); 2730 2731 vsrc = load8888 (&src); 2732 vsrca = expand_alpha (vsrc); 2733 2734 while (height--) 2735 { 2736 int twidth = width; 2737 uint32_t *p = (uint32_t *)mask_line; 2738 uint16_t *q = (uint16_t *)dst_line; 2739 2740 while (twidth && ((uintptr_t)q & 7)) 2741 { 2742 uint32_t m = *(uint32_t *)p; 2743 2744 if (m) 2745 { 2746 uint64_t d = *q; 2747 __m64 vdest = expand565 (to_m64 (d), 0); 2748 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); 2749 *q = to_uint64 (vdest); 2750 } 2751 2752 twidth--; 2753 p++; 2754 q++; 2755 } 2756 2757 while (twidth >= 4) 2758 { 2759 uint32_t m0, m1, m2, m3; 2760 2761 m0 = *p; 2762 m1 = *(p + 1); 2763 m2 = *(p + 2); 2764 m3 = *(p + 3); 2765 2766 if ((m0 | m1 | m2 | m3)) 2767 { 2768 __m64 vdest = *(__m64 *)q; 2769 __m64 v0, v1, v2, v3; 2770 2771 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); 2772 2773 v0 = in_over (vsrc, vsrca, load8888 (&m0), v0); 2774 v1 = in_over (vsrc, vsrca, load8888 (&m1), v1); 2775 v2 = in_over (vsrc, vsrca, load8888 (&m2), v2); 2776 v3 = in_over (vsrc, vsrca, load8888 (&m3), v3); 2777 2778 *(__m64 *)q = pack_4x565 (v0, v1, v2, v3); 2779 } 2780 twidth -= 4; 2781 p += 4; 2782 q += 4; 2783 } 2784 2785 while (twidth) 2786 { 2787 uint32_t m; 2788 2789 m = *(uint32_t *)p; 2790 if (m) 2791 { 2792 uint64_t d = *q; 2793 __m64 vdest = expand565 (to_m64 (d), 0); 2794 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); 2795 *q = to_uint64 (vdest); 2796 } 2797 2798 twidth--; 2799 p++; 2800 q++; 2801 } 2802 2803 mask_line += mask_stride; 2804 dst_line += dst_stride; 2805 } 2806 2807 _mm_empty (); 2808 } 2809 2810 static void 2811 mmx_composite_in_n_8_8 (pixman_implementation_t *imp, 2812 pixman_composite_info_t *info) 2813 { 2814 PIXMAN_COMPOSITE_ARGS (info); 2815 uint8_t *dst_line, *dst; 2816 uint8_t *mask_line, *mask; 2817 int dst_stride, mask_stride; 2818 int32_t w; 2819 uint32_t src; 2820 uint8_t sa; 2821 __m64 vsrc, vsrca; 2822 2823 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 2824 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 2825 2826 src = _pixman_image_get_solid (imp, src_image, 
dest_image->bits.format); 2827 2828 sa = src >> 24; 2829 2830 vsrc = load8888 (&src); 2831 vsrca = expand_alpha (vsrc); 2832 2833 while (height--) 2834 { 2835 dst = dst_line; 2836 dst_line += dst_stride; 2837 mask = mask_line; 2838 mask_line += mask_stride; 2839 w = width; 2840 2841 while (w && (uintptr_t)dst & 7) 2842 { 2843 uint16_t tmp; 2844 uint8_t a; 2845 uint32_t m, d; 2846 2847 a = *mask++; 2848 d = *dst; 2849 2850 m = MUL_UN8 (sa, a, tmp); 2851 d = MUL_UN8 (m, d, tmp); 2852 2853 *dst++ = d; 2854 w--; 2855 } 2856 2857 while (w >= 4) 2858 { 2859 __m64 vmask; 2860 __m64 vdest; 2861 2862 vmask = load8888u ((uint32_t *)mask); 2863 vdest = load8888 ((uint32_t *)dst); 2864 2865 store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest)); 2866 2867 dst += 4; 2868 mask += 4; 2869 w -= 4; 2870 } 2871 2872 while (w--) 2873 { 2874 uint16_t tmp; 2875 uint8_t a; 2876 uint32_t m, d; 2877 2878 a = *mask++; 2879 d = *dst; 2880 2881 m = MUL_UN8 (sa, a, tmp); 2882 d = MUL_UN8 (m, d, tmp); 2883 2884 *dst++ = d; 2885 } 2886 } 2887 2888 _mm_empty (); 2889 } 2890 2891 static void 2892 mmx_composite_in_8_8 (pixman_implementation_t *imp, 2893 pixman_composite_info_t *info) 2894 { 2895 PIXMAN_COMPOSITE_ARGS (info); 2896 uint8_t *dst_line, *dst; 2897 uint8_t *src_line, *src; 2898 int src_stride, dst_stride; 2899 int32_t w; 2900 2901 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 2902 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); 2903 2904 while (height--) 2905 { 2906 dst = dst_line; 2907 dst_line += dst_stride; 2908 src = src_line; 2909 src_line += src_stride; 2910 w = width; 2911 2912 while (w && (uintptr_t)dst & 3) 2913 { 2914 uint8_t s, d; 2915 uint16_t tmp; 2916 2917 s = *src; 2918 d = *dst; 2919 2920 *dst = MUL_UN8 (s, d, tmp); 2921 2922 src++; 2923 dst++; 2924 w--; 2925 } 2926 2927 while (w >= 4) 2928 { 2929 uint32_t *s = (uint32_t *)src; 2930 uint32_t *d = (uint32_t *)dst; 2931 2932 store8888 (d, in (load8888u (s), load8888 (d))); 2933 2934 w -= 4; 2935 dst += 4; 2936 src += 4; 2937 } 2938 2939 while (w--) 2940 { 2941 uint8_t s, d; 2942 uint16_t tmp; 2943 2944 s = *src; 2945 d = *dst; 2946 2947 *dst = MUL_UN8 (s, d, tmp); 2948 2949 src++; 2950 dst++; 2951 } 2952 } 2953 2954 _mm_empty (); 2955 } 2956 2957 static void 2958 mmx_composite_add_n_8_8 (pixman_implementation_t *imp, 2959 pixman_composite_info_t *info) 2960 { 2961 PIXMAN_COMPOSITE_ARGS (info); 2962 uint8_t *dst_line, *dst; 2963 uint8_t *mask_line, *mask; 2964 int dst_stride, mask_stride; 2965 int32_t w; 2966 uint32_t src; 2967 uint8_t sa; 2968 __m64 vsrc, vsrca; 2969 2970 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 2971 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 2972 2973 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 2974 2975 sa = src >> 24; 2976 2977 if (src == 0) 2978 return; 2979 2980 vsrc = load8888 (&src); 2981 vsrca = expand_alpha (vsrc); 2982 2983 while (height--) 2984 { 2985 dst = dst_line; 2986 dst_line += dst_stride; 2987 mask = mask_line; 2988 mask_line += mask_stride; 2989 w = width; 2990 2991 while (w && (uintptr_t)dst & 3) 2992 { 2993 uint16_t tmp; 2994 uint16_t a; 2995 uint32_t m, d; 2996 uint32_t r; 2997 2998 a = *mask++; 2999 d = *dst; 3000 3001 m = MUL_UN8 (sa, a, tmp); 3002 r = ADD_UN8 (m, d, tmp); 3003 3004 *dst++ = r; 3005 w--; 3006 } 3007 3008 while (w >= 4) 3009 { 3010 __m64 vmask; 3011 __m64 vdest; 3012 3013 vmask = load8888u 
((uint32_t *)mask); 3014 vdest = load8888 ((uint32_t *)dst); 3015 3016 store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest)); 3017 3018 dst += 4; 3019 mask += 4; 3020 w -= 4; 3021 } 3022 3023 while (w--) 3024 { 3025 uint16_t tmp; 3026 uint16_t a; 3027 uint32_t m, d; 3028 uint32_t r; 3029 3030 a = *mask++; 3031 d = *dst; 3032 3033 m = MUL_UN8 (sa, a, tmp); 3034 r = ADD_UN8 (m, d, tmp); 3035 3036 *dst++ = r; 3037 } 3038 } 3039 3040 _mm_empty (); 3041 } 3042 3043 static void 3044 mmx_composite_add_8_8 (pixman_implementation_t *imp, 3045 pixman_composite_info_t *info) 3046 { 3047 PIXMAN_COMPOSITE_ARGS (info); 3048 uint8_t *dst_line, *dst; 3049 uint8_t *src_line, *src; 3050 int dst_stride, src_stride; 3051 int32_t w; 3052 uint8_t s, d; 3053 uint16_t t; 3054 3055 CHECKPOINT (); 3056 3057 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); 3058 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 3059 3060 while (height--) 3061 { 3062 dst = dst_line; 3063 dst_line += dst_stride; 3064 src = src_line; 3065 src_line += src_stride; 3066 w = width; 3067 3068 while (w && (uintptr_t)dst & 7) 3069 { 3070 s = *src; 3071 d = *dst; 3072 t = d + s; 3073 s = t | (0 - (t >> 8)); 3074 *dst = s; 3075 3076 dst++; 3077 src++; 3078 w--; 3079 } 3080 3081 while (w >= 8) 3082 { 3083 *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst); 3084 dst += 8; 3085 src += 8; 3086 w -= 8; 3087 } 3088 3089 while (w) 3090 { 3091 s = *src; 3092 d = *dst; 3093 t = d + s; 3094 s = t | (0 - (t >> 8)); 3095 *dst = s; 3096 3097 dst++; 3098 src++; 3099 w--; 3100 } 3101 } 3102 3103 _mm_empty (); 3104 } 3105 3106 static void 3107 mmx_composite_add_0565_0565 (pixman_implementation_t *imp, 3108 pixman_composite_info_t *info) 3109 { 3110 PIXMAN_COMPOSITE_ARGS (info); 3111 uint16_t *dst_line, *dst; 3112 uint32_t d; 3113 uint16_t *src_line, *src; 3114 uint32_t s; 3115 int dst_stride, src_stride; 3116 int32_t w; 3117 3118 CHECKPOINT (); 3119 3120 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1); 3121 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 3122 3123 while (height--) 3124 { 3125 dst = dst_line; 3126 dst_line += dst_stride; 3127 src = src_line; 3128 src_line += src_stride; 3129 w = width; 3130 3131 while (w && (uintptr_t)dst & 7) 3132 { 3133 s = *src++; 3134 if (s) 3135 { 3136 d = *dst; 3137 s = convert_0565_to_8888 (s); 3138 if (d) 3139 { 3140 d = convert_0565_to_8888 (d); 3141 UN8x4_ADD_UN8x4 (s, d); 3142 } 3143 *dst = convert_8888_to_0565 (s); 3144 } 3145 dst++; 3146 w--; 3147 } 3148 3149 while (w >= 4) 3150 { 3151 __m64 vdest = *(__m64 *)dst; 3152 __m64 vsrc = ldq_u ((__m64 *)src); 3153 __m64 vd0, vd1; 3154 __m64 vs0, vs1; 3155 3156 expand_4xpacked565 (vdest, &vd0, &vd1, 0); 3157 expand_4xpacked565 (vsrc, &vs0, &vs1, 0); 3158 3159 vd0 = _mm_adds_pu8 (vd0, vs0); 3160 vd1 = _mm_adds_pu8 (vd1, vs1); 3161 3162 *(__m64 *)dst = pack_4xpacked565 (vd0, vd1); 3163 3164 dst += 4; 3165 src += 4; 3166 w -= 4; 3167 } 3168 3169 while (w--) 3170 { 3171 s = *src++; 3172 if (s) 3173 { 3174 d = *dst; 3175 s = convert_0565_to_8888 (s); 3176 if (d) 3177 { 3178 d = convert_0565_to_8888 (d); 3179 UN8x4_ADD_UN8x4 (s, d); 3180 } 3181 *dst = convert_8888_to_0565 (s); 3182 } 3183 dst++; 3184 } 3185 } 3186 3187 _mm_empty (); 3188 } 3189 3190 static void 3191 mmx_composite_add_8888_8888 (pixman_implementation_t *imp, 3192 pixman_composite_info_t *info) 3193 { 3194 PIXMAN_COMPOSITE_ARGS (info); 3195 
uint32_t *dst_line, *dst; 3196 uint32_t *src_line, *src; 3197 int dst_stride, src_stride; 3198 int32_t w; 3199 3200 CHECKPOINT (); 3201 3202 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 3203 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 3204 3205 while (height--) 3206 { 3207 dst = dst_line; 3208 dst_line += dst_stride; 3209 src = src_line; 3210 src_line += src_stride; 3211 w = width; 3212 3213 while (w && (uintptr_t)dst & 7) 3214 { 3215 store (dst, _mm_adds_pu8 (load ((const uint32_t *)src), 3216 load ((const uint32_t *)dst))); 3217 dst++; 3218 src++; 3219 w--; 3220 } 3221 3222 while (w >= 2) 3223 { 3224 *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst); 3225 dst += 2; 3226 src += 2; 3227 w -= 2; 3228 } 3229 3230 if (w) 3231 { 3232 store (dst, _mm_adds_pu8 (load ((const uint32_t *)src), 3233 load ((const uint32_t *)dst))); 3234 3235 } 3236 } 3237 3238 _mm_empty (); 3239 } 3240 3241 static pixman_bool_t 3242 mmx_blt (pixman_implementation_t *imp, 3243 uint32_t * src_bits, 3244 uint32_t * dst_bits, 3245 int src_stride, 3246 int dst_stride, 3247 int src_bpp, 3248 int dst_bpp, 3249 int src_x, 3250 int src_y, 3251 int dest_x, 3252 int dest_y, 3253 int width, 3254 int height) 3255 { 3256 uint8_t * src_bytes; 3257 uint8_t * dst_bytes; 3258 int byte_width; 3259 3260 if (src_bpp != dst_bpp) 3261 return FALSE; 3262 3263 if (src_bpp == 16) 3264 { 3265 src_stride = src_stride * (int) sizeof (uint32_t) / 2; 3266 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; 3267 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); 3268 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); 3269 byte_width = 2 * width; 3270 src_stride *= 2; 3271 dst_stride *= 2; 3272 } 3273 else if (src_bpp == 32) 3274 { 3275 src_stride = src_stride * (int) sizeof (uint32_t) / 4; 3276 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; 3277 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); 3278 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); 3279 byte_width = 4 * width; 3280 src_stride *= 4; 3281 dst_stride *= 4; 3282 } 3283 else 3284 { 3285 return FALSE; 3286 } 3287 3288 while (height--) 3289 { 3290 int w; 3291 uint8_t *s = src_bytes; 3292 uint8_t *d = dst_bytes; 3293 src_bytes += src_stride; 3294 dst_bytes += dst_stride; 3295 w = byte_width; 3296 3297 if (w >= 1 && ((uintptr_t)d & 1)) 3298 { 3299 *(uint8_t *)d = *(uint8_t *)s; 3300 w -= 1; 3301 s += 1; 3302 d += 1; 3303 } 3304 3305 if (w >= 2 && ((uintptr_t)d & 3)) 3306 { 3307 *(uint16_t *)d = *(uint16_t *)s; 3308 w -= 2; 3309 s += 2; 3310 d += 2; 3311 } 3312 3313 while (w >= 4 && ((uintptr_t)d & 7)) 3314 { 3315 *(uint32_t *)d = ldl_u ((uint32_t *)s); 3316 3317 w -= 4; 3318 s += 4; 3319 d += 4; 3320 } 3321 3322 while (w >= 64) 3323 { 3324 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX 3325 __asm__ ( 3326 "movq (%1), %%mm0\n" 3327 "movq 8(%1), %%mm1\n" 3328 "movq 16(%1), %%mm2\n" 3329 "movq 24(%1), %%mm3\n" 3330 "movq 32(%1), %%mm4\n" 3331 "movq 40(%1), %%mm5\n" 3332 "movq 48(%1), %%mm6\n" 3333 "movq 56(%1), %%mm7\n" 3334 3335 "movq %%mm0, (%0)\n" 3336 "movq %%mm1, 8(%0)\n" 3337 "movq %%mm2, 16(%0)\n" 3338 "movq %%mm3, 24(%0)\n" 3339 "movq %%mm4, 32(%0)\n" 3340 "movq %%mm5, 40(%0)\n" 3341 "movq %%mm6, 48(%0)\n" 3342 "movq %%mm7, 56(%0)\n" 3343 : 3344 : "r" (d), "r" (s) 3345 : "memory", 3346 "%mm0", "%mm1", 
"%mm2", "%mm3", 3347 "%mm4", "%mm5", "%mm6", "%mm7"); 3348 #else 3349 __m64 v0 = ldq_u ((__m64 *)(s + 0)); 3350 __m64 v1 = ldq_u ((__m64 *)(s + 8)); 3351 __m64 v2 = ldq_u ((__m64 *)(s + 16)); 3352 __m64 v3 = ldq_u ((__m64 *)(s + 24)); 3353 __m64 v4 = ldq_u ((__m64 *)(s + 32)); 3354 __m64 v5 = ldq_u ((__m64 *)(s + 40)); 3355 __m64 v6 = ldq_u ((__m64 *)(s + 48)); 3356 __m64 v7 = ldq_u ((__m64 *)(s + 56)); 3357 *(__m64 *)(d + 0) = v0; 3358 *(__m64 *)(d + 8) = v1; 3359 *(__m64 *)(d + 16) = v2; 3360 *(__m64 *)(d + 24) = v3; 3361 *(__m64 *)(d + 32) = v4; 3362 *(__m64 *)(d + 40) = v5; 3363 *(__m64 *)(d + 48) = v6; 3364 *(__m64 *)(d + 56) = v7; 3365 #endif 3366 3367 w -= 64; 3368 s += 64; 3369 d += 64; 3370 } 3371 while (w >= 4) 3372 { 3373 *(uint32_t *)d = ldl_u ((uint32_t *)s); 3374 3375 w -= 4; 3376 s += 4; 3377 d += 4; 3378 } 3379 if (w >= 2) 3380 { 3381 *(uint16_t *)d = *(uint16_t *)s; 3382 w -= 2; 3383 s += 2; 3384 d += 2; 3385 } 3386 } 3387 3388 _mm_empty (); 3389 3390 return TRUE; 3391 } 3392 3393 static void 3394 mmx_composite_copy_area (pixman_implementation_t *imp, 3395 pixman_composite_info_t *info) 3396 { 3397 PIXMAN_COMPOSITE_ARGS (info); 3398 3399 mmx_blt (imp, src_image->bits.bits, 3400 dest_image->bits.bits, 3401 src_image->bits.rowstride, 3402 dest_image->bits.rowstride, 3403 PIXMAN_FORMAT_BPP (src_image->bits.format), 3404 PIXMAN_FORMAT_BPP (dest_image->bits.format), 3405 src_x, src_y, dest_x, dest_y, width, height); 3406 } 3407 3408 static void 3409 mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp, 3410 pixman_composite_info_t *info) 3411 { 3412 PIXMAN_COMPOSITE_ARGS (info); 3413 uint32_t *src, *src_line; 3414 uint32_t *dst, *dst_line; 3415 uint8_t *mask, *mask_line; 3416 int src_stride, mask_stride, dst_stride; 3417 int32_t w; 3418 3419 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 3420 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 3421 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 3422 3423 while (height--) 3424 { 3425 src = src_line; 3426 src_line += src_stride; 3427 dst = dst_line; 3428 dst_line += dst_stride; 3429 mask = mask_line; 3430 mask_line += mask_stride; 3431 3432 w = width; 3433 3434 while (w--) 3435 { 3436 uint64_t m = *mask; 3437 3438 if (m) 3439 { 3440 uint32_t ssrc = *src | 0xff000000; 3441 __m64 s = load8888 (&ssrc); 3442 3443 if (m == 0xff) 3444 { 3445 store8888 (dst, s); 3446 } 3447 else 3448 { 3449 __m64 sa = expand_alpha (s); 3450 __m64 vm = expand_alpha_rev (to_m64 (m)); 3451 __m64 vdest = in_over (s, sa, vm, load8888 (dst)); 3452 3453 store8888 (dst, vdest); 3454 } 3455 } 3456 3457 mask++; 3458 dst++; 3459 src++; 3460 } 3461 } 3462 3463 _mm_empty (); 3464 } 3465 3466 static void 3467 mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp, 3468 pixman_composite_info_t *info) 3469 { 3470 PIXMAN_COMPOSITE_ARGS (info); 3471 uint32_t src; 3472 uint32_t *dst_line, *dst; 3473 int32_t w; 3474 int dst_stride; 3475 __m64 vsrc; 3476 3477 CHECKPOINT (); 3478 3479 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 3480 3481 if (src == 0) 3482 return; 3483 3484 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 3485 3486 vsrc = load8888 (&src); 3487 3488 while (height--) 3489 { 3490 dst = dst_line; 3491 dst_line += dst_stride; 3492 w = width; 3493 3494 CHECKPOINT (); 3495 3496 while (w && (uintptr_t)dst & 7) 3497 { 3498 __m64 vdest = load8888 (dst); 3499 3500 store8888 
(dst, over (vdest, expand_alpha (vdest), vsrc)); 3501 3502 w--; 3503 dst++; 3504 } 3505 3506 while (w >= 2) 3507 { 3508 __m64 vdest = *(__m64 *)dst; 3509 __m64 dest0 = expand8888 (vdest, 0); 3510 __m64 dest1 = expand8888 (vdest, 1); 3511 3512 3513 dest0 = over (dest0, expand_alpha (dest0), vsrc); 3514 dest1 = over (dest1, expand_alpha (dest1), vsrc); 3515 3516 *(__m64 *)dst = pack8888 (dest0, dest1); 3517 3518 dst += 2; 3519 w -= 2; 3520 } 3521 3522 CHECKPOINT (); 3523 3524 if (w) 3525 { 3526 __m64 vdest = load8888 (dst); 3527 3528 store8888 (dst, over (vdest, expand_alpha (vdest), vsrc)); 3529 } 3530 } 3531 3532 _mm_empty (); 3533 } 3534 3535 #define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS)) 3536 #define BMSK (BSHIFT - 1) 3537 3538 #define BILINEAR_DECLARE_VARIABLES \ 3539 const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt); \ 3540 const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb); \ 3541 const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT); \ 3542 const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1); \ 3543 const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK); \ 3544 const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x); \ 3545 const __m64 mm_zero = _mm_setzero_si64 (); \ 3546 __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx) 3547 3548 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \ 3549 do { \ 3550 /* fetch 2x2 pixel block into 2 mmx registers */ \ 3551 __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]); \ 3552 __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]); \ 3553 /* vertical interpolation */ \ 3554 __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt); \ 3555 __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt); \ 3556 __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb); \ 3557 __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \ 3558 __m64 hi = _mm_add_pi16 (t_hi, b_hi); \ 3559 __m64 lo = _mm_add_pi16 (t_lo, b_lo); \ 3560 vx += unit_x; \ 3561 if (BILINEAR_INTERPOLATION_BITS < 8) \ 3562 { \ 3563 /* calculate horizontal weights */ \ 3564 __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7, \ 3565 _mm_srli_pi16 (mm_x, \ 3566 16 - BILINEAR_INTERPOLATION_BITS))); \ 3567 /* horizontal interpolation */ \ 3568 __m64 p = _mm_unpacklo_pi16 (lo, hi); \ 3569 __m64 q = _mm_unpackhi_pi16 (lo, hi); \ 3570 lo = _mm_madd_pi16 (p, mm_wh); \ 3571 hi = _mm_madd_pi16 (q, mm_wh); \ 3572 } \ 3573 else \ 3574 { \ 3575 /* calculate horizontal weights */ \ 3576 __m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x, \ 3577 16 - BILINEAR_INTERPOLATION_BITS)); \ 3578 __m64 mm_wh_hi = _mm_srli_pi16 (mm_x, \ 3579 16 - BILINEAR_INTERPOLATION_BITS); \ 3580 /* horizontal interpolation */ \ 3581 __m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo); \ 3582 __m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi); \ 3583 __m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo); \ 3584 __m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi); \ 3585 lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo), \ 3586 _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi)); \ 3587 hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo), \ 3588 _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi)); \ 3589 } \ 3590 mm_x = _mm_add_pi16 (mm_x, mm_ux); \ 3591 /* shift and pack the result */ \ 3592 hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2); \ 3593 lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2); \ 3594 lo = _mm_packs_pi32 (lo, hi); \ 3595 lo = _mm_packs_pu16 (lo, lo); \ 3596 pix = lo; \ 3597 } while (0) 3598 3599 #define BILINEAR_SKIP_ONE_PIXEL() 
\ 3600 do { \ 3601 vx += unit_x; \ 3602 mm_x = _mm_add_pi16 (mm_x, mm_ux); \ 3603 } while(0) 3604 3605 static force_inline void 3606 scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t * dst, 3607 const uint32_t * mask, 3608 const uint32_t * src_top, 3609 const uint32_t * src_bottom, 3610 int32_t w, 3611 int wt, 3612 int wb, 3613 pixman_fixed_t vx, 3614 pixman_fixed_t unit_x, 3615 pixman_fixed_t max_vx, 3616 pixman_bool_t zero_src) 3617 { 3618 BILINEAR_DECLARE_VARIABLES; 3619 __m64 pix; 3620 3621 while (w--) 3622 { 3623 BILINEAR_INTERPOLATE_ONE_PIXEL (pix); 3624 store (dst, pix); 3625 dst++; 3626 } 3627 3628 _mm_empty (); 3629 } 3630 3631 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC, 3632 scaled_bilinear_scanline_mmx_8888_8888_SRC, 3633 uint32_t, uint32_t, uint32_t, 3634 COVER, FLAG_NONE) 3635 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC, 3636 scaled_bilinear_scanline_mmx_8888_8888_SRC, 3637 uint32_t, uint32_t, uint32_t, 3638 PAD, FLAG_NONE) 3639 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC, 3640 scaled_bilinear_scanline_mmx_8888_8888_SRC, 3641 uint32_t, uint32_t, uint32_t, 3642 NONE, FLAG_NONE) 3643 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC, 3644 scaled_bilinear_scanline_mmx_8888_8888_SRC, 3645 uint32_t, uint32_t, uint32_t, 3646 NORMAL, FLAG_NONE) 3647 3648 static force_inline void 3649 scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t * dst, 3650 const uint32_t * mask, 3651 const uint32_t * src_top, 3652 const uint32_t * src_bottom, 3653 int32_t w, 3654 int wt, 3655 int wb, 3656 pixman_fixed_t vx, 3657 pixman_fixed_t unit_x, 3658 pixman_fixed_t max_vx, 3659 pixman_bool_t zero_src) 3660 { 3661 BILINEAR_DECLARE_VARIABLES; 3662 __m64 pix1, pix2; 3663 3664 while (w) 3665 { 3666 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 3667 3668 if (!is_zero (pix1)) 3669 { 3670 pix2 = load (dst); 3671 store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2)); 3672 } 3673 3674 w--; 3675 dst++; 3676 } 3677 3678 _mm_empty (); 3679 } 3680 3681 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER, 3682 scaled_bilinear_scanline_mmx_8888_8888_OVER, 3683 uint32_t, uint32_t, uint32_t, 3684 COVER, FLAG_NONE) 3685 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER, 3686 scaled_bilinear_scanline_mmx_8888_8888_OVER, 3687 uint32_t, uint32_t, uint32_t, 3688 PAD, FLAG_NONE) 3689 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER, 3690 scaled_bilinear_scanline_mmx_8888_8888_OVER, 3691 uint32_t, uint32_t, uint32_t, 3692 NONE, FLAG_NONE) 3693 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER, 3694 scaled_bilinear_scanline_mmx_8888_8888_OVER, 3695 uint32_t, uint32_t, uint32_t, 3696 NORMAL, FLAG_NONE) 3697 3698 static force_inline void 3699 scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t * dst, 3700 const uint8_t * mask, 3701 const uint32_t * src_top, 3702 const uint32_t * src_bottom, 3703 int32_t w, 3704 int wt, 3705 int wb, 3706 pixman_fixed_t vx, 3707 pixman_fixed_t unit_x, 3708 pixman_fixed_t max_vx, 3709 pixman_bool_t zero_src) 3710 { 3711 BILINEAR_DECLARE_VARIABLES; 3712 __m64 pix1, pix2; 3713 uint32_t m; 3714 3715 while (w) 3716 { 3717 m = (uint32_t) *mask++; 3718 3719 if (m) 3720 { 3721 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 3722 3723 if (m == 0xff && is_opaque (pix1)) 3724 { 3725 store (dst, pix1); 3726 } 3727 else 3728 { 3729 __m64 ms, md, ma, msa; 3730 3731 pix2 = load (dst); 3732 ma = expand_alpha_rev (to_m64 (m)); 3733 ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ()); 3734 md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ()); 3735 3736 msa = 
expand_alpha (ms); 3737 3738 store8888 (dst, (in_over (ms, msa, ma, md))); 3739 } 3740 } 3741 else 3742 { 3743 BILINEAR_SKIP_ONE_PIXEL (); 3744 } 3745 3746 w--; 3747 dst++; 3748 } 3749 3750 _mm_empty (); 3751 } 3752 3753 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER, 3754 scaled_bilinear_scanline_mmx_8888_8_8888_OVER, 3755 uint32_t, uint8_t, uint32_t, 3756 COVER, FLAG_HAVE_NON_SOLID_MASK) 3757 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER, 3758 scaled_bilinear_scanline_mmx_8888_8_8888_OVER, 3759 uint32_t, uint8_t, uint32_t, 3760 PAD, FLAG_HAVE_NON_SOLID_MASK) 3761 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER, 3762 scaled_bilinear_scanline_mmx_8888_8_8888_OVER, 3763 uint32_t, uint8_t, uint32_t, 3764 NONE, FLAG_HAVE_NON_SOLID_MASK) 3765 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER, 3766 scaled_bilinear_scanline_mmx_8888_8_8888_OVER, 3767 uint32_t, uint8_t, uint32_t, 3768 NORMAL, FLAG_HAVE_NON_SOLID_MASK) 3769 3770 static uint32_t * 3771 mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) 3772 { 3773 int w = iter->width; 3774 uint32_t *dst = iter->buffer; 3775 uint32_t *src = (uint32_t *)iter->bits; 3776 3777 iter->bits += iter->stride; 3778 3779 while (w && ((uintptr_t)dst) & 7) 3780 { 3781 *dst++ = (*src++) | 0xff000000; 3782 w--; 3783 } 3784 3785 while (w >= 8) 3786 { 3787 __m64 vsrc1 = ldq_u ((__m64 *)(src + 0)); 3788 __m64 vsrc2 = ldq_u ((__m64 *)(src + 2)); 3789 __m64 vsrc3 = ldq_u ((__m64 *)(src + 4)); 3790 __m64 vsrc4 = ldq_u ((__m64 *)(src + 6)); 3791 3792 *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000)); 3793 *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000)); 3794 *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000)); 3795 *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000)); 3796 3797 dst += 8; 3798 src += 8; 3799 w -= 8; 3800 } 3801 3802 while (w) 3803 { 3804 *dst++ = (*src++) | 0xff000000; 3805 w--; 3806 } 3807 3808 _mm_empty (); 3809 return iter->buffer; 3810 } 3811 3812 static uint32_t * 3813 mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) 3814 { 3815 int w = iter->width; 3816 uint32_t *dst = iter->buffer; 3817 uint16_t *src = (uint16_t *)iter->bits; 3818 3819 iter->bits += iter->stride; 3820 3821 while (w && ((uintptr_t)dst) & 0x0f) 3822 { 3823 uint16_t s = *src++; 3824 3825 *dst++ = convert_0565_to_8888 (s); 3826 w--; 3827 } 3828 3829 while (w >= 4) 3830 { 3831 __m64 vsrc = ldq_u ((__m64 *)src); 3832 __m64 mm0, mm1; 3833 3834 expand_4xpacked565 (vsrc, &mm0, &mm1, 1); 3835 3836 *(__m64 *)(dst + 0) = mm0; 3837 *(__m64 *)(dst + 2) = mm1; 3838 3839 dst += 4; 3840 src += 4; 3841 w -= 4; 3842 } 3843 3844 while (w) 3845 { 3846 uint16_t s = *src++; 3847 3848 *dst++ = convert_0565_to_8888 (s); 3849 w--; 3850 } 3851 3852 _mm_empty (); 3853 return iter->buffer; 3854 } 3855 3856 static uint32_t * 3857 mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) 3858 { 3859 int w = iter->width; 3860 uint32_t *dst = iter->buffer; 3861 uint8_t *src = iter->bits; 3862 3863 iter->bits += iter->stride; 3864 3865 while (w && (((uintptr_t)dst) & 15)) 3866 { 3867 *dst++ = *(src++) << 24; 3868 w--; 3869 } 3870 3871 while (w >= 8) 3872 { 3873 __m64 mm0 = ldq_u ((__m64 *)src); 3874 3875 __m64 mm1 = _mm_unpacklo_pi8 (_mm_setzero_si64(), mm0); 3876 __m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64(), mm0); 3877 __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1); 3878 __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1); 3879 __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2); 3880 __m64 mm6 = 
_mm_unpackhi_pi16 (_mm_setzero_si64(), mm2); 3881 3882 *(__m64 *)(dst + 0) = mm3; 3883 *(__m64 *)(dst + 2) = mm4; 3884 *(__m64 *)(dst + 4) = mm5; 3885 *(__m64 *)(dst + 6) = mm6; 3886 3887 dst += 8; 3888 src += 8; 3889 w -= 8; 3890 } 3891 3892 while (w) 3893 { 3894 *dst++ = *(src++) << 24; 3895 w--; 3896 } 3897 3898 _mm_empty (); 3899 return iter->buffer; 3900 } 3901 3902 typedef struct 3903 { 3904 pixman_format_code_t format; 3905 pixman_iter_get_scanline_t get_scanline; 3906 } fetcher_info_t; 3907 3908 static const fetcher_info_t fetchers[] = 3909 { 3910 { PIXMAN_x8r8g8b8, mmx_fetch_x8r8g8b8 }, 3911 { PIXMAN_r5g6b5, mmx_fetch_r5g6b5 }, 3912 { PIXMAN_a8, mmx_fetch_a8 }, 3913 { PIXMAN_null } 3914 }; 3915 3916 static pixman_bool_t 3917 mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) 3918 { 3919 pixman_image_t *image = iter->image; 3920 3921 #define FLAGS \ 3922 (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \ 3923 FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) 3924 3925 if ((iter->iter_flags & ITER_NARROW) && 3926 (iter->image_flags & FLAGS) == FLAGS) 3927 { 3928 const fetcher_info_t *f; 3929 3930 for (f = &fetchers[0]; f->format != PIXMAN_null; f++) 3931 { 3932 if (image->common.extended_format_code == f->format) 3933 { 3934 uint8_t *b = (uint8_t *)image->bits.bits; 3935 int s = image->bits.rowstride * 4; 3936 3937 iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8; 3938 iter->stride = s; 3939 3940 iter->get_scanline = f->get_scanline; 3941 return TRUE; 3942 } 3943 } 3944 } 3945 3946 return FALSE; 3947 } 3948 3949 static const pixman_fast_path_t mmx_fast_paths[] = 3950 { 3951 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ), 3952 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mmx_composite_over_n_8_0565 ), 3953 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mmx_composite_over_n_8_8888 ), 3954 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mmx_composite_over_n_8_8888 ), 3955 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mmx_composite_over_n_8_8888 ), 3956 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mmx_composite_over_n_8_8888 ), 3957 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ), 3958 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ), 3959 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mmx_composite_over_n_8888_0565_ca ), 3960 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ), 3961 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ), 3962 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mmx_composite_over_n_8888_0565_ca ), 3963 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, mmx_composite_over_pixbuf_8888 ), 3964 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, mmx_composite_over_pixbuf_8888 ), 3965 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, mmx_composite_over_pixbuf_0565 ), 3966 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, mmx_composite_over_pixbuf_8888 ), 3967 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, mmx_composite_over_pixbuf_8888 ), 3968 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, mmx_composite_over_pixbuf_0565 ), 3969 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, mmx_composite_over_x888_n_8888 ), 3970 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, mmx_composite_over_x888_n_8888 ), 3971 PIXMAN_STD_FAST_PATH (OVER, 
x8b8g8r8, solid, a8b8g8r8, mmx_composite_over_x888_n_8888 ), 3972 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, mmx_composite_over_x888_n_8888 ), 3973 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mmx_composite_over_8888_n_8888 ), 3974 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mmx_composite_over_8888_n_8888 ), 3975 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ), 3976 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ), 3977 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ), 3978 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ), 3979 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ), 3980 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, mmx_composite_over_x888_8_8888 ), 3981 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ), 3982 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ), 3983 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ), 3984 PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, mmx_composite_over_n_0565 ), 3985 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), 3986 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), 3987 3988 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mmx_composite_over_8888_8888 ), 3989 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mmx_composite_over_8888_8888 ), 3990 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mmx_composite_over_8888_0565 ), 3991 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mmx_composite_over_8888_8888 ), 3992 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ), 3993 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ), 3994 3995 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888), 3996 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888), 3997 3998 PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, mmx_composite_add_0565_0565 ), 3999 PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, mmx_composite_add_0565_0565 ), 4000 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ), 4001 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ), 4002 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ), 4003 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ), 4004 4005 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ), 4006 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ), 4007 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ), 4008 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ), 4009 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ), 4010 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ), 4011 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ), 4012 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mmx_composite_src_n_8_8888 ), 4013 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mmx_composite_copy_area ), 4014 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mmx_composite_copy_area ), 4015 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, 
null, x8r8g8b8, mmx_composite_copy_area ), 4016 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), 4017 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), 4018 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), 4019 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mmx_composite_copy_area ), 4020 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mmx_composite_copy_area ), 4021 4022 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ), 4023 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ), 4024 4025 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ), 4026 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ), 4027 SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ), 4028 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ), 4029 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ), 4030 SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888 ), 4031 4032 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ), 4033 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ), 4034 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ), 4035 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ), 4036 4037 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888 ), 4038 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888 ), 4039 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888 ), 4040 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888 ), 4041 4042 { PIXMAN_OP_NONE }, 4043 }; 4044 4045 pixman_implementation_t * 4046 _pixman_implementation_create_mmx (pixman_implementation_t *fallback) 4047 { 4048 pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths); 4049 4050 imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u; 4051 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u; 4052 imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u; 4053 imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u; 4054 imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u; 4055 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u; 4056 imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u; 4057 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u; 4058 imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u; 4059 imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u; 4060 imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u; 4061 4062 imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca; 4063 imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca; 4064 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca; 4065 imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca; 4066 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca; 4067 imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca; 4068 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca; 4069 imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca; 4070 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca; 4071 imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca; 4072 imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca; 4073 4074 imp->blt = mmx_blt; 4075 imp->fill = mmx_fill; 4076 4077 imp->src_iter_init = mmx_src_iter_init; 4078 4079 return imp; 4080 } 4081 4082 
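/* Each PIXMAN_STD_FAST_PATH entry in mmx_fast_paths above keys a
 * composite routine on (operator, source format, mask format,
 * destination format); pixman's generic dispatcher scans this table
 * before falling back to the delegate implementation, while the
 * combine_32 / combine_32_ca tables filled in above supply the
 * per-scanline combiners used by the general compositing path. */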
#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */
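/* A minimal sketch, not part of this file, of how a CPU-detection layer
 * could stack this implementation on top of a generic fallback.  Only
 * _pixman_implementation_create_mmx() is taken from the code above;
 * have_mmx() stands in for whatever runtime feature check the build
 * actually uses (cf. pixman-x86.c):
 *
 *     static pixman_implementation_t *
 *     get_implementation (pixman_implementation_t *fallback)
 *     {
 *         pixman_implementation_t *imp = fallback;
 *
 *     #ifdef USE_X86_MMX
 *         if (have_mmx ())                  // hypothetical CPU check
 *             imp = _pixman_implementation_create_mmx (imp);
 *     #endif
 *
 *         return imp;   // MMX fast paths are tried first, then the fallback
 *     }
 */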