/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

/* Include guard; closed by the final #endif of this file. */
#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

/* Refuse to compile unless the compiler reports SSE support via __SSE__. */
#ifndef __SSE__
#error "SSE instruction set not enabled"
#else

/* MMX types (__m64 and friends) used by the mixed MMX/SSE intrinsics. */
#include <mmintrin.h>

/* 16-byte vector types: __v4si holds ints, __v4sf and __m128 hold floats.
   __m128 is the type used in the intrinsic signatures of this header. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));

// <mm_malloc.h> should only be included in a hosted environment as it depends
// on a standard library to provide allocation routines.
39 #if __STDC_HOSTED__ 40 #include <mm_malloc.h> 41 #endif 42 43 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 44 _mm_add_ss(__m128 a, __m128 b) 45 { 46 a[0] += b[0]; 47 return a; 48 } 49 50 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 51 _mm_add_ps(__m128 a, __m128 b) 52 { 53 return a + b; 54 } 55 56 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 57 _mm_sub_ss(__m128 a, __m128 b) 58 { 59 a[0] -= b[0]; 60 return a; 61 } 62 63 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 64 _mm_sub_ps(__m128 a, __m128 b) 65 { 66 return a - b; 67 } 68 69 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 70 _mm_mul_ss(__m128 a, __m128 b) 71 { 72 a[0] *= b[0]; 73 return a; 74 } 75 76 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 77 _mm_mul_ps(__m128 a, __m128 b) 78 { 79 return a * b; 80 } 81 82 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 83 _mm_div_ss(__m128 a, __m128 b) 84 { 85 a[0] /= b[0]; 86 return a; 87 } 88 89 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 90 _mm_div_ps(__m128 a, __m128 b) 91 { 92 return a / b; 93 } 94 95 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 96 _mm_sqrt_ss(__m128 a) 97 { 98 return __builtin_ia32_sqrtss(a); 99 } 100 101 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 102 _mm_sqrt_ps(__m128 a) 103 { 104 return __builtin_ia32_sqrtps(a); 105 } 106 107 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 108 _mm_rcp_ss(__m128 a) 109 { 110 return __builtin_ia32_rcpss(a); 111 } 112 113 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 114 _mm_rcp_ps(__m128 a) 115 { 116 return __builtin_ia32_rcpps(a); 117 } 118 119 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 120 _mm_rsqrt_ss(__m128 a) 121 { 122 return __builtin_ia32_rsqrtss(a); 
123 } 124 125 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 126 _mm_rsqrt_ps(__m128 a) 127 { 128 return __builtin_ia32_rsqrtps(a); 129 } 130 131 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 132 _mm_min_ss(__m128 a, __m128 b) 133 { 134 return __builtin_ia32_minss(a, b); 135 } 136 137 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 138 _mm_min_ps(__m128 a, __m128 b) 139 { 140 return __builtin_ia32_minps(a, b); 141 } 142 143 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 144 _mm_max_ss(__m128 a, __m128 b) 145 { 146 return __builtin_ia32_maxss(a, b); 147 } 148 149 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 150 _mm_max_ps(__m128 a, __m128 b) 151 { 152 return __builtin_ia32_maxps(a, b); 153 } 154 155 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 156 _mm_and_ps(__m128 a, __m128 b) 157 { 158 return (__m128)((__v4si)a & (__v4si)b); 159 } 160 161 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 162 _mm_andnot_ps(__m128 a, __m128 b) 163 { 164 return (__m128)(~(__v4si)a & (__v4si)b); 165 } 166 167 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 168 _mm_or_ps(__m128 a, __m128 b) 169 { 170 return (__m128)((__v4si)a | (__v4si)b); 171 } 172 173 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 174 _mm_xor_ps(__m128 a, __m128 b) 175 { 176 return (__m128)((__v4si)a ^ (__v4si)b); 177 } 178 179 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 180 _mm_cmpeq_ss(__m128 a, __m128 b) 181 { 182 return (__m128)__builtin_ia32_cmpss(a, b, 0); 183 } 184 185 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 186 _mm_cmpeq_ps(__m128 a, __m128 b) 187 { 188 return (__m128)__builtin_ia32_cmpps(a, b, 0); 189 } 190 191 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 192 _mm_cmplt_ss(__m128 a, __m128 
b) 193 { 194 return (__m128)__builtin_ia32_cmpss(a, b, 1); 195 } 196 197 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 198 _mm_cmplt_ps(__m128 a, __m128 b) 199 { 200 return (__m128)__builtin_ia32_cmpps(a, b, 1); 201 } 202 203 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 204 _mm_cmple_ss(__m128 a, __m128 b) 205 { 206 return (__m128)__builtin_ia32_cmpss(a, b, 2); 207 } 208 209 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 210 _mm_cmple_ps(__m128 a, __m128 b) 211 { 212 return (__m128)__builtin_ia32_cmpps(a, b, 2); 213 } 214 215 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 216 _mm_cmpgt_ss(__m128 a, __m128 b) 217 { 218 return (__m128)__builtin_ia32_cmpss(b, a, 1); 219 } 220 221 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 222 _mm_cmpgt_ps(__m128 a, __m128 b) 223 { 224 return (__m128)__builtin_ia32_cmpps(b, a, 1); 225 } 226 227 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 228 _mm_cmpge_ss(__m128 a, __m128 b) 229 { 230 return (__m128)__builtin_ia32_cmpss(b, a, 2); 231 } 232 233 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 234 _mm_cmpge_ps(__m128 a, __m128 b) 235 { 236 return (__m128)__builtin_ia32_cmpps(b, a, 2); 237 } 238 239 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 240 _mm_cmpneq_ss(__m128 a, __m128 b) 241 { 242 return (__m128)__builtin_ia32_cmpss(a, b, 4); 243 } 244 245 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 246 _mm_cmpneq_ps(__m128 a, __m128 b) 247 { 248 return (__m128)__builtin_ia32_cmpps(a, b, 4); 249 } 250 251 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 252 _mm_cmpnlt_ss(__m128 a, __m128 b) 253 { 254 return (__m128)__builtin_ia32_cmpss(a, b, 5); 255 } 256 257 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 258 _mm_cmpnlt_ps(__m128 a, __m128 b) 259 { 260 
return (__m128)__builtin_ia32_cmpps(a, b, 5); 261 } 262 263 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 264 _mm_cmpnle_ss(__m128 a, __m128 b) 265 { 266 return (__m128)__builtin_ia32_cmpss(a, b, 6); 267 } 268 269 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 270 _mm_cmpnle_ps(__m128 a, __m128 b) 271 { 272 return (__m128)__builtin_ia32_cmpps(a, b, 6); 273 } 274 275 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 276 _mm_cmpngt_ss(__m128 a, __m128 b) 277 { 278 return (__m128)__builtin_ia32_cmpss(b, a, 5); 279 } 280 281 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 282 _mm_cmpngt_ps(__m128 a, __m128 b) 283 { 284 return (__m128)__builtin_ia32_cmpps(b, a, 5); 285 } 286 287 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 288 _mm_cmpnge_ss(__m128 a, __m128 b) 289 { 290 return (__m128)__builtin_ia32_cmpss(b, a, 6); 291 } 292 293 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 294 _mm_cmpnge_ps(__m128 a, __m128 b) 295 { 296 return (__m128)__builtin_ia32_cmpps(b, a, 6); 297 } 298 299 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 300 _mm_cmpord_ss(__m128 a, __m128 b) 301 { 302 return (__m128)__builtin_ia32_cmpss(a, b, 7); 303 } 304 305 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 306 _mm_cmpord_ps(__m128 a, __m128 b) 307 { 308 return (__m128)__builtin_ia32_cmpps(a, b, 7); 309 } 310 311 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 312 _mm_cmpunord_ss(__m128 a, __m128 b) 313 { 314 return (__m128)__builtin_ia32_cmpss(a, b, 3); 315 } 316 317 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 318 _mm_cmpunord_ps(__m128 a, __m128 b) 319 { 320 return (__m128)__builtin_ia32_cmpps(a, b, 3); 321 } 322 323 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 324 _mm_comieq_ss(__m128 a, __m128 b) 325 { 326 return 
__builtin_ia32_comieq(a, b); 327 } 328 329 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 330 _mm_comilt_ss(__m128 a, __m128 b) 331 { 332 return __builtin_ia32_comilt(a, b); 333 } 334 335 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 336 _mm_comile_ss(__m128 a, __m128 b) 337 { 338 return __builtin_ia32_comile(a, b); 339 } 340 341 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 342 _mm_comigt_ss(__m128 a, __m128 b) 343 { 344 return __builtin_ia32_comigt(a, b); 345 } 346 347 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 348 _mm_comige_ss(__m128 a, __m128 b) 349 { 350 return __builtin_ia32_comige(a, b); 351 } 352 353 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 354 _mm_comineq_ss(__m128 a, __m128 b) 355 { 356 return __builtin_ia32_comineq(a, b); 357 } 358 359 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 360 _mm_ucomieq_ss(__m128 a, __m128 b) 361 { 362 return __builtin_ia32_ucomieq(a, b); 363 } 364 365 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 366 _mm_ucomilt_ss(__m128 a, __m128 b) 367 { 368 return __builtin_ia32_ucomilt(a, b); 369 } 370 371 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 372 _mm_ucomile_ss(__m128 a, __m128 b) 373 { 374 return __builtin_ia32_ucomile(a, b); 375 } 376 377 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 378 _mm_ucomigt_ss(__m128 a, __m128 b) 379 { 380 return __builtin_ia32_ucomigt(a, b); 381 } 382 383 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 384 _mm_ucomige_ss(__m128 a, __m128 b) 385 { 386 return __builtin_ia32_ucomige(a, b); 387 } 388 389 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 390 _mm_ucomineq_ss(__m128 a, __m128 b) 391 { 392 return __builtin_ia32_ucomineq(a, b); 393 } 394 395 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 396 
_mm_cvtss_si32(__m128 a) 397 { 398 return __builtin_ia32_cvtss2si(a); 399 } 400 401 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 402 _mm_cvt_ss2si(__m128 a) 403 { 404 return _mm_cvtss_si32(a); 405 } 406 407 #ifdef __x86_64__ 408 409 static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 410 _mm_cvtss_si64(__m128 a) 411 { 412 return __builtin_ia32_cvtss2si64(a); 413 } 414 415 #endif 416 417 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 418 _mm_cvtps_pi32(__m128 a) 419 { 420 return (__m64)__builtin_ia32_cvtps2pi(a); 421 } 422 423 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 424 _mm_cvt_ps2pi(__m128 a) 425 { 426 return _mm_cvtps_pi32(a); 427 } 428 429 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 430 _mm_cvttss_si32(__m128 a) 431 { 432 return a[0]; 433 } 434 435 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 436 _mm_cvtt_ss2si(__m128 a) 437 { 438 return _mm_cvttss_si32(a); 439 } 440 441 static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 442 _mm_cvttss_si64(__m128 a) 443 { 444 return a[0]; 445 } 446 447 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 448 _mm_cvttps_pi32(__m128 a) 449 { 450 return (__m64)__builtin_ia32_cvttps2pi(a); 451 } 452 453 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 454 _mm_cvtt_ps2pi(__m128 a) 455 { 456 return _mm_cvttps_pi32(a); 457 } 458 459 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 460 _mm_cvtsi32_ss(__m128 a, int b) 461 { 462 a[0] = b; 463 return a; 464 } 465 466 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 467 _mm_cvt_si2ss(__m128 a, int b) 468 { 469 return _mm_cvtsi32_ss(a, b); 470 } 471 472 #ifdef __x86_64__ 473 474 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 475 _mm_cvtsi64_ss(__m128 a, long long b) 476 { 477 a[0] = b; 478 return a; 
479 } 480 481 #endif 482 483 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 484 _mm_cvtpi32_ps(__m128 a, __m64 b) 485 { 486 return __builtin_ia32_cvtpi2ps(a, (__v2si)b); 487 } 488 489 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 490 _mm_cvt_pi2ps(__m128 a, __m64 b) 491 { 492 return _mm_cvtpi32_ps(a, b); 493 } 494 495 static __inline__ float __attribute__((__always_inline__, __nodebug__)) 496 _mm_cvtss_f32(__m128 a) 497 { 498 return a[0]; 499 } 500 501 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 502 _mm_loadh_pi(__m128 a, const __m64 *p) 503 { 504 __m128 b; 505 b[0] = *(float*)p; 506 b[1] = *((float*)p+1); 507 return __builtin_shufflevector(a, b, 0, 1, 4, 5); 508 } 509 510 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 511 _mm_loadl_pi(__m128 a, const __m64 *p) 512 { 513 __m128 b; 514 b[0] = *(float*)p; 515 b[1] = *((float*)p+1); 516 return __builtin_shufflevector(a, b, 4, 5, 2, 3); 517 } 518 519 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 520 _mm_load_ss(const float *p) 521 { 522 return (__m128){ *p, 0, 0, 0 }; 523 } 524 525 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 526 _mm_load1_ps(const float *p) 527 { 528 return (__m128){ *p, *p, *p, *p }; 529 } 530 531 #define _mm_load_ps1(p) _mm_load1_ps(p) 532 533 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 534 _mm_load_ps(const float *p) 535 { 536 return *(__m128*)p; 537 } 538 539 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 540 _mm_loadu_ps(const float *p) 541 { 542 struct __loadu_ps { 543 __m128 v; 544 } __attribute__((packed, may_alias)); 545 return ((struct __loadu_ps*)p)->v; 546 } 547 548 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 549 _mm_loadr_ps(const float *p) 550 { 551 __m128 a = _mm_load_ps(p); 552 return __builtin_shufflevector(a, a, 3, 2, 1, 0); 553 } 554 555 
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 556 _mm_set_ss(float w) 557 { 558 return (__m128){ w, 0, 0, 0 }; 559 } 560 561 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 562 _mm_set1_ps(float w) 563 { 564 return (__m128){ w, w, w, w }; 565 } 566 567 // Microsoft specific. 568 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 569 _mm_set_ps1(float w) 570 { 571 return _mm_set1_ps(w); 572 } 573 574 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 575 _mm_set_ps(float z, float y, float x, float w) 576 { 577 return (__m128){ w, x, y, z }; 578 } 579 580 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 581 _mm_setr_ps(float z, float y, float x, float w) 582 { 583 return (__m128){ z, y, x, w }; 584 } 585 586 static __inline__ __m128 __attribute__((__always_inline__)) 587 _mm_setzero_ps(void) 588 { 589 return (__m128){ 0, 0, 0, 0 }; 590 } 591 592 static __inline__ void __attribute__((__always_inline__)) 593 _mm_storeh_pi(__m64 *p, __m128 a) 594 { 595 __builtin_ia32_storehps((__v2si *)p, a); 596 } 597 598 static __inline__ void __attribute__((__always_inline__)) 599 _mm_storel_pi(__m64 *p, __m128 a) 600 { 601 __builtin_ia32_storelps((__v2si *)p, a); 602 } 603 604 static __inline__ void __attribute__((__always_inline__)) 605 _mm_store_ss(float *p, __m128 a) 606 { 607 *p = a[0]; 608 } 609 610 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 611 _mm_storeu_ps(float *p, __m128 a) 612 { 613 __builtin_ia32_storeups(p, a); 614 } 615 616 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 617 _mm_store1_ps(float *p, __m128 a) 618 { 619 a = __builtin_shufflevector(a, a, 0, 0, 0, 0); 620 _mm_storeu_ps(p, a); 621 } 622 623 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 624 _mm_store_ps1(float *p, __m128 a) 625 { 626 return _mm_store1_ps(p, a); 627 } 628 629 static __inline__ void 
__attribute__((__always_inline__, __nodebug__)) 630 _mm_store_ps(float *p, __m128 a) 631 { 632 *(__m128 *)p = a; 633 } 634 635 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 636 _mm_storer_ps(float *p, __m128 a) 637 { 638 a = __builtin_shufflevector(a, a, 3, 2, 1, 0); 639 _mm_store_ps(p, a); 640 } 641 642 #define _MM_HINT_T0 3 643 #define _MM_HINT_T1 2 644 #define _MM_HINT_T2 1 645 #define _MM_HINT_NTA 0 646 647 /* FIXME: We have to #define this because "sel" must be a constant integer, and 648 Sema doesn't do any form of constant propagation yet. */ 649 650 #define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, sel)) 651 652 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 653 _mm_stream_pi(__m64 *p, __m64 a) 654 { 655 __builtin_ia32_movntq(p, a); 656 } 657 658 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 659 _mm_stream_ps(float *p, __m128 a) 660 { 661 __builtin_ia32_movntps(p, a); 662 } 663 664 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 665 _mm_sfence(void) 666 { 667 __builtin_ia32_sfence(); 668 } 669 670 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 671 _mm_extract_pi16(__m64 a, int n) 672 { 673 __v4hi b = (__v4hi)a; 674 return (unsigned short)b[n & 3]; 675 } 676 677 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 678 _mm_insert_pi16(__m64 a, int d, int n) 679 { 680 __v4hi b = (__v4hi)a; 681 b[n & 3] = d; 682 return (__m64)b; 683 } 684 685 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 686 _mm_max_pi16(__m64 a, __m64 b) 687 { 688 return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b); 689 } 690 691 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 692 _mm_max_pu8(__m64 a, __m64 b) 693 { 694 return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b); 695 } 696 697 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 698 _mm_min_pi16(__m64 a, 
__m64 b) 699 { 700 return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b); 701 } 702 703 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 704 _mm_min_pu8(__m64 a, __m64 b) 705 { 706 return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b); 707 } 708 709 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 710 _mm_movemask_pi8(__m64 a) 711 { 712 return __builtin_ia32_pmovmskb((__v8qi)a); 713 } 714 715 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 716 _mm_mulhi_pu16(__m64 a, __m64 b) 717 { 718 return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b); 719 } 720 721 #define _mm_shuffle_pi16(a, n) \ 722 ((__m64)__builtin_ia32_pshufw(a, n)) 723 724 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 725 _mm_maskmove_si64(__m64 d, __m64 n, char *p) 726 { 727 __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p); 728 } 729 730 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 731 _mm_avg_pu8(__m64 a, __m64 b) 732 { 733 return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b); 734 } 735 736 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 737 _mm_avg_pu16(__m64 a, __m64 b) 738 { 739 return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b); 740 } 741 742 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 743 _mm_sad_pu8(__m64 a, __m64 b) 744 { 745 return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b); 746 } 747 748 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) 749 _mm_getcsr(void) 750 { 751 return __builtin_ia32_stmxcsr(); 752 } 753 754 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 755 _mm_setcsr(unsigned int i) 756 { 757 __builtin_ia32_ldmxcsr(i); 758 } 759 760 #define _mm_shuffle_ps(a, b, mask) \ 761 (__builtin_shufflevector((__v4sf)(a), (__v4sf)(b), \ 762 (mask) & 0x3, ((mask) & 0xc) >> 2, \ 763 (((mask) & 0x30) >> 4) + 4, \ 764 (((mask) & 0xc0) >> 6) + 4)) 765 
766 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 767 _mm_unpackhi_ps(__m128 a, __m128 b) 768 { 769 return __builtin_shufflevector(a, b, 2, 6, 3, 7); 770 } 771 772 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 773 _mm_unpacklo_ps(__m128 a, __m128 b) 774 { 775 return __builtin_shufflevector(a, b, 0, 4, 1, 5); 776 } 777 778 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 779 _mm_move_ss(__m128 a, __m128 b) 780 { 781 return __builtin_shufflevector(a, b, 4, 1, 2, 3); 782 } 783 784 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 785 _mm_movehl_ps(__m128 a, __m128 b) 786 { 787 return __builtin_shufflevector(a, b, 6, 7, 2, 3); 788 } 789 790 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 791 _mm_movelh_ps(__m128 a, __m128 b) 792 { 793 return __builtin_shufflevector(a, b, 0, 1, 4, 5); 794 } 795 796 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 797 _mm_cvtpi16_ps(__m64 a) 798 { 799 __m64 b, c; 800 __m128 r; 801 802 b = _mm_setzero_si64(); 803 b = _mm_cmpgt_pi16(b, a); 804 c = _mm_unpackhi_pi16(a, b); 805 r = _mm_setzero_ps(); 806 r = _mm_cvtpi32_ps(r, c); 807 r = _mm_movelh_ps(r, r); 808 c = _mm_unpacklo_pi16(a, b); 809 r = _mm_cvtpi32_ps(r, c); 810 811 return r; 812 } 813 814 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 815 _mm_cvtpu16_ps(__m64 a) 816 { 817 __m64 b, c; 818 __m128 r; 819 820 b = _mm_setzero_si64(); 821 c = _mm_unpackhi_pi16(a, b); 822 r = _mm_setzero_ps(); 823 r = _mm_cvtpi32_ps(r, c); 824 r = _mm_movelh_ps(r, r); 825 c = _mm_unpacklo_pi16(a, b); 826 r = _mm_cvtpi32_ps(r, c); 827 828 return r; 829 } 830 831 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 832 _mm_cvtpi8_ps(__m64 a) 833 { 834 __m64 b; 835 836 b = _mm_setzero_si64(); 837 b = _mm_cmpgt_pi8(b, a); 838 b = _mm_unpacklo_pi8(a, b); 839 840 return _mm_cvtpi16_ps(b); 841 } 842 843 static 
__inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 844 _mm_cvtpu8_ps(__m64 a) 845 { 846 __m64 b; 847 848 b = _mm_setzero_si64(); 849 b = _mm_unpacklo_pi8(a, b); 850 851 return _mm_cvtpi16_ps(b); 852 } 853 854 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 855 _mm_cvtpi32x2_ps(__m64 a, __m64 b) 856 { 857 __m128 c; 858 859 c = _mm_setzero_ps(); 860 c = _mm_cvtpi32_ps(c, b); 861 c = _mm_movelh_ps(c, c); 862 863 return _mm_cvtpi32_ps(c, a); 864 } 865 866 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 867 _mm_cvtps_pi16(__m128 a) 868 { 869 __m64 b, c; 870 871 b = _mm_cvtps_pi32(a); 872 a = _mm_movehl_ps(a, a); 873 c = _mm_cvtps_pi32(a); 874 875 return _mm_packs_pi16(b, c); 876 } 877 878 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 879 _mm_cvtps_pi8(__m128 a) 880 { 881 __m64 b, c; 882 883 b = _mm_cvtps_pi16(a); 884 c = _mm_setzero_si64(); 885 886 return _mm_packs_pi16(b, c); 887 } 888 889 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 890 _mm_movemask_ps(__m128 a) 891 { 892 return __builtin_ia32_movmskps(a); 893 } 894 895 #define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 896 897 #define _MM_EXCEPT_INVALID (0x0001) 898 #define _MM_EXCEPT_DENORM (0x0002) 899 #define _MM_EXCEPT_DIV_ZERO (0x0004) 900 #define _MM_EXCEPT_OVERFLOW (0x0008) 901 #define _MM_EXCEPT_UNDERFLOW (0x0010) 902 #define _MM_EXCEPT_INEXACT (0x0020) 903 #define _MM_EXCEPT_MASK (0x003f) 904 905 #define _MM_MASK_INVALID (0x0080) 906 #define _MM_MASK_DENORM (0x0100) 907 #define _MM_MASK_DIV_ZERO (0x0200) 908 #define _MM_MASK_OVERFLOW (0x0400) 909 #define _MM_MASK_UNDERFLOW (0x0800) 910 #define _MM_MASK_INEXACT (0x1000) 911 #define _MM_MASK_MASK (0x1f80) 912 913 #define _MM_ROUND_NEAREST (0x0000) 914 #define _MM_ROUND_DOWN (0x2000) 915 #define _MM_ROUND_UP (0x4000) 916 #define _MM_ROUND_TOWARD_ZERO (0x6000) 917 #define _MM_ROUND_MASK (0x6000) 918 919 #define 
_MM_FLUSH_ZERO_MASK (0x8000) 920 #define _MM_FLUSH_ZERO_ON (0x8000) 921 #define _MM_FLUSH_ZERO_OFF (0x8000) 922 923 #define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 924 #define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 925 #define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 926 #define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 927 928 #define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 929 #define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 930 #define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 931 #define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 932 933 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 934 do { \ 935 __m128 tmp3, tmp2, tmp1, tmp0; \ 936 tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 937 tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 938 tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 939 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 940 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 941 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 942 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 943 (row3) = _mm_movehl_ps(tmp3, tmp1); \ 944 } while (0) 945 946 /* Aliases for compatibility. 
*/ 947 #define _m_pextrw _mm_extract_pi16 948 #define _m_pinsrw _mm_insert_pi16 949 #define _m_pmaxsw _mm_max_pi16 950 #define _m_pmaxub _mm_max_pu8 951 #define _m_pminsw _mm_min_pi16 952 #define _m_pminub _mm_min_pu8 953 #define _m_pmovmskb _mm_movemask_pi8 954 #define _m_pmulhuw _mm_mulhi_pu16 955 #define _m_pshufw _mm_shuffle_pi16 956 #define _m_maskmovq _mm_maskmove_si64 957 #define _m_pavgb _mm_avg_pu8 958 #define _m_pavgw _mm_avg_pu16 959 #define _m_psadbw _mm_sad_pu8 960 #define _m_ _mm_ 961 #define _m_ _mm_ 962 963 /* Ugly hack for backwards-compatibility (compatible with gcc) */ 964 #ifdef __SSE2__ 965 #include <emmintrin.h> 966 #endif 967 968 #endif /* __SSE__ */ 969 970 #endif /* __XMMINTRIN_H */ 971