1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24 #ifndef __XMMINTRIN_H 25 #define __XMMINTRIN_H 26 27 #include <mmintrin.h> 28 29 typedef int __v4si __attribute__((__vector_size__(16))); 30 typedef float __v4sf __attribute__((__vector_size__(16))); 31 typedef float __m128 __attribute__((__vector_size__(16))); 32 33 /* This header should only be included in a hosted environment as it depends on 34 * a standard library to provide allocation routines. */ 35 #if __STDC_HOSTED__ 36 #include <mm_malloc.h> 37 #endif 38 39 /* Define the default attributes for the functions in this file. */ 40 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"))) 41 42 static __inline__ __m128 __DEFAULT_FN_ATTRS 43 _mm_add_ss(__m128 __a, __m128 __b) 44 { 45 __a[0] += __b[0]; 46 return __a; 47 } 48 49 static __inline__ __m128 __DEFAULT_FN_ATTRS 50 _mm_add_ps(__m128 __a, __m128 __b) 51 { 52 return __a + __b; 53 } 54 55 static __inline__ __m128 __DEFAULT_FN_ATTRS 56 _mm_sub_ss(__m128 __a, __m128 __b) 57 { 58 __a[0] -= __b[0]; 59 return __a; 60 } 61 62 static __inline__ __m128 __DEFAULT_FN_ATTRS 63 _mm_sub_ps(__m128 __a, __m128 __b) 64 { 65 return __a - __b; 66 } 67 68 static __inline__ __m128 __DEFAULT_FN_ATTRS 69 _mm_mul_ss(__m128 __a, __m128 __b) 70 { 71 __a[0] *= __b[0]; 72 return __a; 73 } 74 75 static __inline__ __m128 __DEFAULT_FN_ATTRS 76 _mm_mul_ps(__m128 __a, __m128 __b) 77 { 78 return __a * __b; 79 } 80 81 static __inline__ __m128 __DEFAULT_FN_ATTRS 82 _mm_div_ss(__m128 __a, __m128 __b) 83 { 84 __a[0] /= __b[0]; 85 return __a; 86 } 87 88 static __inline__ __m128 __DEFAULT_FN_ATTRS 89 _mm_div_ps(__m128 __a, __m128 __b) 90 { 91 return __a / __b; 92 } 93 94 static __inline__ __m128 __DEFAULT_FN_ATTRS 95 _mm_sqrt_ss(__m128 __a) 96 { 97 __m128 __c = __builtin_ia32_sqrtss(__a); 98 return (__m128) { __c[0], __a[1], __a[2], __a[3] }; 99 } 100 101 static __inline__ __m128 __DEFAULT_FN_ATTRS 102 _mm_sqrt_ps(__m128 __a) 103 { 104 return __builtin_ia32_sqrtps(__a); 105 } 106 107 static __inline__ __m128 __DEFAULT_FN_ATTRS 108 _mm_rcp_ss(__m128 __a) 109 { 110 __m128 __c = __builtin_ia32_rcpss(__a); 111 return (__m128) { __c[0], __a[1], __a[2], __a[3] }; 112 } 113 114 static __inline__ __m128 __DEFAULT_FN_ATTRS 115 _mm_rcp_ps(__m128 __a) 116 { 117 return __builtin_ia32_rcpps(__a); 118 } 119 120 static __inline__ __m128 __DEFAULT_FN_ATTRS 121 _mm_rsqrt_ss(__m128 __a) 122 { 123 __m128 __c = __builtin_ia32_rsqrtss(__a); 124 return (__m128) { __c[0], __a[1], __a[2], __a[3] }; 125 } 126 127 static __inline__ __m128 __DEFAULT_FN_ATTRS 128 _mm_rsqrt_ps(__m128 __a) 129 { 130 return __builtin_ia32_rsqrtps(__a); 131 } 132 133 static __inline__ __m128 __DEFAULT_FN_ATTRS 134 _mm_min_ss(__m128 __a, __m128 __b) 135 { 136 return __builtin_ia32_minss(__a, __b); 137 } 138 139 static __inline__ __m128 __DEFAULT_FN_ATTRS 140 _mm_min_ps(__m128 __a, __m128 __b) 141 { 142 return __builtin_ia32_minps(__a, __b); 143 } 144 145 static __inline__ __m128 __DEFAULT_FN_ATTRS 146 _mm_max_ss(__m128 __a, __m128 __b) 147 { 148 return __builtin_ia32_maxss(__a, __b); 149 } 150 151 static __inline__ __m128 __DEFAULT_FN_ATTRS 152 _mm_max_ps(__m128 __a, __m128 __b) 153 { 154 return __builtin_ia32_maxps(__a, __b); 155 } 156 157 static __inline__ __m128 __DEFAULT_FN_ATTRS 158 _mm_and_ps(__m128 __a, __m128 __b) 159 { 160 return (__m128)((__v4si)__a & (__v4si)__b); 161 } 162 163 static __inline__ __m128 __DEFAULT_FN_ATTRS 164 _mm_andnot_ps(__m128 __a, __m128 __b) 165 { 166 return (__m128)(~(__v4si)__a & (__v4si)__b); 167 } 168 169 static __inline__ __m128 __DEFAULT_FN_ATTRS 170 _mm_or_ps(__m128 __a, __m128 __b) 171 { 172 return (__m128)((__v4si)__a | (__v4si)__b); 173 } 174 175 static __inline__ __m128 __DEFAULT_FN_ATTRS 176 _mm_xor_ps(__m128 __a, __m128 __b) 177 { 178 return (__m128)((__v4si)__a ^ (__v4si)__b); 179 } 180 181 static __inline__ __m128 __DEFAULT_FN_ATTRS 182 _mm_cmpeq_ss(__m128 __a, __m128 __b) 183 { 184 return (__m128)__builtin_ia32_cmpeqss(__a, __b); 185 } 186 187 static __inline__ __m128 __DEFAULT_FN_ATTRS 188 _mm_cmpeq_ps(__m128 __a, __m128 __b) 189 { 190 return (__m128)__builtin_ia32_cmpeqps(__a, __b); 191 } 192 193 static __inline__ __m128 __DEFAULT_FN_ATTRS 194 _mm_cmplt_ss(__m128 __a, __m128 __b) 195 { 196 return (__m128)__builtin_ia32_cmpltss(__a, __b); 197 } 198 199 static __inline__ __m128 __DEFAULT_FN_ATTRS 200 _mm_cmplt_ps(__m128 __a, __m128 __b) 201 { 202 return (__m128)__builtin_ia32_cmpltps(__a, __b); 203 } 204 205 static __inline__ __m128 __DEFAULT_FN_ATTRS 206 _mm_cmple_ss(__m128 __a, __m128 __b) 207 { 208 return (__m128)__builtin_ia32_cmpless(__a, __b); 209 } 210 211 static __inline__ __m128 __DEFAULT_FN_ATTRS 212 _mm_cmple_ps(__m128 __a, __m128 __b) 213 { 214 return (__m128)__builtin_ia32_cmpleps(__a, __b); 215 } 216 217 static __inline__ __m128 __DEFAULT_FN_ATTRS 218 _mm_cmpgt_ss(__m128 __a, __m128 __b) 219 { 220 return (__m128)__builtin_shufflevector(__a, 221 __builtin_ia32_cmpltss(__b, __a), 222 4, 1, 2, 3); 223 } 224 225 static __inline__ __m128 __DEFAULT_FN_ATTRS 226 _mm_cmpgt_ps(__m128 __a, __m128 __b) 227 { 228 return (__m128)__builtin_ia32_cmpltps(__b, __a); 229 } 230 231 static __inline__ __m128 __DEFAULT_FN_ATTRS 232 _mm_cmpge_ss(__m128 __a, __m128 __b) 233 { 234 return (__m128)__builtin_shufflevector(__a, 235 __builtin_ia32_cmpless(__b, __a), 236 4, 1, 2, 3); 237 } 238 239 static __inline__ __m128 __DEFAULT_FN_ATTRS 240 _mm_cmpge_ps(__m128 __a, __m128 __b) 241 { 242 return (__m128)__builtin_ia32_cmpleps(__b, __a); 243 } 244 245 static __inline__ __m128 __DEFAULT_FN_ATTRS 246 _mm_cmpneq_ss(__m128 __a, __m128 __b) 247 { 248 return (__m128)__builtin_ia32_cmpneqss(__a, __b); 249 } 250 251 static __inline__ __m128 __DEFAULT_FN_ATTRS 252 _mm_cmpneq_ps(__m128 __a, __m128 __b) 253 { 254 return (__m128)__builtin_ia32_cmpneqps(__a, __b); 255 } 256 257 static __inline__ __m128 __DEFAULT_FN_ATTRS 258 _mm_cmpnlt_ss(__m128 __a, __m128 __b) 259 { 260 return (__m128)__builtin_ia32_cmpnltss(__a, __b); 261 } 262 263 static __inline__ __m128 __DEFAULT_FN_ATTRS 264 _mm_cmpnlt_ps(__m128 __a, __m128 __b) 265 { 266 return (__m128)__builtin_ia32_cmpnltps(__a, __b); 267 } 268 269 static __inline__ __m128 __DEFAULT_FN_ATTRS 270 _mm_cmpnle_ss(__m128 __a, __m128 __b) 271 { 272 return (__m128)__builtin_ia32_cmpnless(__a, __b); 273 } 274 275 static __inline__ __m128 __DEFAULT_FN_ATTRS 276 _mm_cmpnle_ps(__m128 __a, __m128 __b) 277 { 278 return (__m128)__builtin_ia32_cmpnleps(__a, __b); 279 } 280 281 static __inline__ __m128 __DEFAULT_FN_ATTRS 282 _mm_cmpngt_ss(__m128 __a, __m128 __b) 283 { 284 return (__m128)__builtin_shufflevector(__a, 285 __builtin_ia32_cmpnltss(__b, __a), 286 4, 1, 2, 3); 287 } 288 289 static __inline__ __m128 __DEFAULT_FN_ATTRS 290 _mm_cmpngt_ps(__m128 __a, __m128 __b) 291 { 292 return (__m128)__builtin_ia32_cmpnltps(__b, __a); 293 } 294 295 static __inline__ __m128 __DEFAULT_FN_ATTRS 296 _mm_cmpnge_ss(__m128 __a, __m128 __b) 297 { 298 return (__m128)__builtin_shufflevector(__a, 299 __builtin_ia32_cmpnless(__b, __a), 300 4, 1, 2, 3); 301 } 302 303 static __inline__ __m128 __DEFAULT_FN_ATTRS 304 _mm_cmpnge_ps(__m128 __a, __m128 __b) 305 { 306 return (__m128)__builtin_ia32_cmpnleps(__b, __a); 307 } 308 309 static __inline__ __m128 __DEFAULT_FN_ATTRS 310 _mm_cmpord_ss(__m128 __a, __m128 __b) 311 { 312 return (__m128)__builtin_ia32_cmpordss(__a, __b); 313 } 314 315 static __inline__ __m128 __DEFAULT_FN_ATTRS 316 _mm_cmpord_ps(__m128 __a, __m128 __b) 317 { 318 return (__m128)__builtin_ia32_cmpordps(__a, __b); 319 } 320 321 static __inline__ __m128 __DEFAULT_FN_ATTRS 322 _mm_cmpunord_ss(__m128 __a, __m128 __b) 323 { 324 return (__m128)__builtin_ia32_cmpunordss(__a, __b); 325 } 326 327 static __inline__ __m128 __DEFAULT_FN_ATTRS 328 _mm_cmpunord_ps(__m128 __a, __m128 __b) 329 { 330 return (__m128)__builtin_ia32_cmpunordps(__a, __b); 331 } 332 333 static __inline__ int __DEFAULT_FN_ATTRS 334 _mm_comieq_ss(__m128 __a, __m128 __b) 335 { 336 return __builtin_ia32_comieq(__a, __b); 337 } 338 339 static __inline__ int __DEFAULT_FN_ATTRS 340 _mm_comilt_ss(__m128 __a, __m128 __b) 341 { 342 return __builtin_ia32_comilt(__a, __b); 343 } 344 345 static __inline__ int __DEFAULT_FN_ATTRS 346 _mm_comile_ss(__m128 __a, __m128 __b) 347 { 348 return __builtin_ia32_comile(__a, __b); 349 } 350 351 static __inline__ int __DEFAULT_FN_ATTRS 352 _mm_comigt_ss(__m128 __a, __m128 __b) 353 { 354 return __builtin_ia32_comigt(__a, __b); 355 } 356 357 static __inline__ int __DEFAULT_FN_ATTRS 358 _mm_comige_ss(__m128 __a, __m128 __b) 359 { 360 return __builtin_ia32_comige(__a, __b); 361 } 362 363 static __inline__ int __DEFAULT_FN_ATTRS 364 _mm_comineq_ss(__m128 __a, __m128 __b) 365 { 366 return __builtin_ia32_comineq(__a, __b); 367 } 368 369 static __inline__ int __DEFAULT_FN_ATTRS 370 _mm_ucomieq_ss(__m128 __a, __m128 __b) 371 { 372 return __builtin_ia32_ucomieq(__a, __b); 373 } 374 375 static __inline__ int __DEFAULT_FN_ATTRS 376 _mm_ucomilt_ss(__m128 __a, __m128 __b) 377 { 378 return __builtin_ia32_ucomilt(__a, __b); 379 } 380 381 static __inline__ int __DEFAULT_FN_ATTRS 382 _mm_ucomile_ss(__m128 __a, __m128 __b) 383 { 384 return __builtin_ia32_ucomile(__a, __b); 385 } 386 387 static __inline__ int __DEFAULT_FN_ATTRS 388 _mm_ucomigt_ss(__m128 __a, __m128 __b) 389 { 390 return __builtin_ia32_ucomigt(__a, __b); 391 } 392 393 static __inline__ int __DEFAULT_FN_ATTRS 394 _mm_ucomige_ss(__m128 __a, __m128 __b) 395 { 396 return __builtin_ia32_ucomige(__a, __b); 397 } 398 399 static __inline__ int __DEFAULT_FN_ATTRS 400 _mm_ucomineq_ss(__m128 __a, __m128 __b) 401 { 402 return __builtin_ia32_ucomineq(__a, __b); 403 } 404 405 static __inline__ int __DEFAULT_FN_ATTRS 406 _mm_cvtss_si32(__m128 __a) 407 { 408 return __builtin_ia32_cvtss2si(__a); 409 } 410 411 static __inline__ int __DEFAULT_FN_ATTRS 412 _mm_cvt_ss2si(__m128 __a) 413 { 414 return _mm_cvtss_si32(__a); 415 } 416 417 #ifdef __x86_64__ 418 419 static __inline__ long long __DEFAULT_FN_ATTRS 420 _mm_cvtss_si64(__m128 __a) 421 { 422 return __builtin_ia32_cvtss2si64(__a); 423 } 424 425 #endif 426 427 static __inline__ __m64 __DEFAULT_FN_ATTRS 428 _mm_cvtps_pi32(__m128 __a) 429 { 430 return (__m64)__builtin_ia32_cvtps2pi(__a); 431 } 432 433 static __inline__ __m64 __DEFAULT_FN_ATTRS 434 _mm_cvt_ps2pi(__m128 __a) 435 { 436 return _mm_cvtps_pi32(__a); 437 } 438 439 static __inline__ int __DEFAULT_FN_ATTRS 440 _mm_cvttss_si32(__m128 __a) 441 { 442 return __a[0]; 443 } 444 445 static __inline__ int __DEFAULT_FN_ATTRS 446 _mm_cvtt_ss2si(__m128 __a) 447 { 448 return _mm_cvttss_si32(__a); 449 } 450 451 static __inline__ long long __DEFAULT_FN_ATTRS 452 _mm_cvttss_si64(__m128 __a) 453 { 454 return __a[0]; 455 } 456 457 static __inline__ __m64 __DEFAULT_FN_ATTRS 458 _mm_cvttps_pi32(__m128 __a) 459 { 460 return (__m64)__builtin_ia32_cvttps2pi(__a); 461 } 462 463 static __inline__ __m64 __DEFAULT_FN_ATTRS 464 _mm_cvtt_ps2pi(__m128 __a) 465 { 466 return _mm_cvttps_pi32(__a); 467 } 468 469 static __inline__ __m128 __DEFAULT_FN_ATTRS 470 _mm_cvtsi32_ss(__m128 __a, int __b) 471 { 472 __a[0] = __b; 473 return __a; 474 } 475 476 static __inline__ __m128 __DEFAULT_FN_ATTRS 477 _mm_cvt_si2ss(__m128 __a, int __b) 478 { 479 return _mm_cvtsi32_ss(__a, __b); 480 } 481 482 #ifdef __x86_64__ 483 484 static __inline__ __m128 __DEFAULT_FN_ATTRS 485 _mm_cvtsi64_ss(__m128 __a, long long __b) 486 { 487 __a[0] = __b; 488 return __a; 489 } 490 491 #endif 492 493 static __inline__ __m128 __DEFAULT_FN_ATTRS 494 _mm_cvtpi32_ps(__m128 __a, __m64 __b) 495 { 496 return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b); 497 } 498 499 static __inline__ __m128 __DEFAULT_FN_ATTRS 500 _mm_cvt_pi2ps(__m128 __a, __m64 __b) 501 { 502 return _mm_cvtpi32_ps(__a, __b); 503 } 504 505 static __inline__ float __DEFAULT_FN_ATTRS 506 _mm_cvtss_f32(__m128 __a) 507 { 508 return __a[0]; 509 } 510 511 static __inline__ __m128 __DEFAULT_FN_ATTRS 512 _mm_loadh_pi(__m128 __a, const __m64 *__p) 513 { 514 typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8))); 515 struct __mm_loadh_pi_struct { 516 __mm_loadh_pi_v2f32 __u; 517 } __attribute__((__packed__, __may_alias__)); 518 __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u; 519 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); 520 return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5); 521 } 522 523 static __inline__ __m128 __DEFAULT_FN_ATTRS 524 _mm_loadl_pi(__m128 __a, const __m64 *__p) 525 { 526 typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8))); 527 struct __mm_loadl_pi_struct { 528 __mm_loadl_pi_v2f32 __u; 529 } __attribute__((__packed__, __may_alias__)); 530 __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u; 531 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); 532 return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3); 533 } 534 535 static __inline__ __m128 __DEFAULT_FN_ATTRS 536 _mm_load_ss(const float *__p) 537 { 538 struct __mm_load_ss_struct { 539 float __u; 540 } __attribute__((__packed__, __may_alias__)); 541 float __u = ((struct __mm_load_ss_struct*)__p)->__u; 542 return (__m128){ __u, 0, 0, 0 }; 543 } 544 545 static __inline__ __m128 __DEFAULT_FN_ATTRS 546 _mm_load1_ps(const float *__p) 547 { 548 struct __mm_load1_ps_struct { 549 float __u; 550 } __attribute__((__packed__, __may_alias__)); 551 float __u = ((struct __mm_load1_ps_struct*)__p)->__u; 552 return (__m128){ __u, __u, __u, __u }; 553 } 554 555 #define _mm_load_ps1(p) _mm_load1_ps(p) 556 557 static __inline__ __m128 __DEFAULT_FN_ATTRS 558 _mm_load_ps(const float *__p) 559 { 560 return *(__m128*)__p; 561 } 562 563 static __inline__ __m128 __DEFAULT_FN_ATTRS 564 _mm_loadu_ps(const float *__p) 565 { 566 struct __loadu_ps { 567 __m128 __v; 568 } __attribute__((__packed__, __may_alias__)); 569 return ((struct __loadu_ps*)__p)->__v; 570 } 571 572 static __inline__ __m128 __DEFAULT_FN_ATTRS 573 _mm_loadr_ps(const float *__p) 574 { 575 __m128 __a = _mm_load_ps(__p); 576 return __builtin_shufflevector(__a, __a, 3, 2, 1, 0); 577 } 578 579 static __inline__ __m128 __DEFAULT_FN_ATTRS 580 _mm_undefined_ps() 581 { 582 return (__m128)__builtin_ia32_undef128(); 583 } 584 585 static __inline__ __m128 __DEFAULT_FN_ATTRS 586 _mm_set_ss(float __w) 587 { 588 return (__m128){ __w, 0, 0, 0 }; 589 } 590 591 static __inline__ __m128 __DEFAULT_FN_ATTRS 592 _mm_set1_ps(float __w) 593 { 594 return (__m128){ __w, __w, __w, __w }; 595 } 596 597 /* Microsoft specific. */ 598 static __inline__ __m128 __DEFAULT_FN_ATTRS 599 _mm_set_ps1(float __w) 600 { 601 return _mm_set1_ps(__w); 602 } 603 604 static __inline__ __m128 __DEFAULT_FN_ATTRS 605 _mm_set_ps(float __z, float __y, float __x, float __w) 606 { 607 return (__m128){ __w, __x, __y, __z }; 608 } 609 610 static __inline__ __m128 __DEFAULT_FN_ATTRS 611 _mm_setr_ps(float __z, float __y, float __x, float __w) 612 { 613 return (__m128){ __z, __y, __x, __w }; 614 } 615 616 static __inline__ __m128 __DEFAULT_FN_ATTRS 617 _mm_setzero_ps(void) 618 { 619 return (__m128){ 0, 0, 0, 0 }; 620 } 621 622 static __inline__ void __DEFAULT_FN_ATTRS 623 _mm_storeh_pi(__m64 *__p, __m128 __a) 624 { 625 __builtin_ia32_storehps((__v2si *)__p, __a); 626 } 627 628 static __inline__ void __DEFAULT_FN_ATTRS 629 _mm_storel_pi(__m64 *__p, __m128 __a) 630 { 631 __builtin_ia32_storelps((__v2si *)__p, __a); 632 } 633 634 static __inline__ void __DEFAULT_FN_ATTRS 635 _mm_store_ss(float *__p, __m128 __a) 636 { 637 struct __mm_store_ss_struct { 638 float __u; 639 } __attribute__((__packed__, __may_alias__)); 640 ((struct __mm_store_ss_struct*)__p)->__u = __a[0]; 641 } 642 643 static __inline__ void __DEFAULT_FN_ATTRS 644 _mm_storeu_ps(float *__p, __m128 __a) 645 { 646 __builtin_ia32_storeups(__p, __a); 647 } 648 649 static __inline__ void __DEFAULT_FN_ATTRS 650 _mm_store1_ps(float *__p, __m128 __a) 651 { 652 __a = __builtin_shufflevector(__a, __a, 0, 0, 0, 0); 653 _mm_storeu_ps(__p, __a); 654 } 655 656 static __inline__ void __DEFAULT_FN_ATTRS 657 _mm_store_ps1(float *__p, __m128 __a) 658 { 659 return _mm_store1_ps(__p, __a); 660 } 661 662 static __inline__ void __DEFAULT_FN_ATTRS 663 _mm_store_ps(float *__p, __m128 __a) 664 { 665 *(__m128 *)__p = __a; 666 } 667 668 static __inline__ void __DEFAULT_FN_ATTRS 669 _mm_storer_ps(float *__p, __m128 __a) 670 { 671 __a = __builtin_shufflevector(__a, __a, 3, 2, 1, 0); 672 _mm_store_ps(__p, __a); 673 } 674 675 #define _MM_HINT_T0 3 676 #define _MM_HINT_T1 2 677 #define _MM_HINT_T2 1 678 #define _MM_HINT_NTA 0 679 680 #ifndef _MSC_VER 681 /* FIXME: We have to #define this because "sel" must be a constant integer, and 682 Sema doesn't do any form of constant propagation yet. */ 683 684 #define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel))) 685 #endif 686 687 static __inline__ void __DEFAULT_FN_ATTRS 688 _mm_stream_pi(__m64 *__p, __m64 __a) 689 { 690 __builtin_ia32_movntq(__p, __a); 691 } 692 693 static __inline__ void __DEFAULT_FN_ATTRS 694 _mm_stream_ps(float *__p, __m128 __a) 695 { 696 __builtin_ia32_movntps(__p, __a); 697 } 698 699 static __inline__ void __DEFAULT_FN_ATTRS 700 _mm_sfence(void) 701 { 702 __builtin_ia32_sfence(); 703 } 704 705 static __inline__ int __DEFAULT_FN_ATTRS 706 _mm_extract_pi16(__m64 __a, int __n) 707 { 708 __v4hi __b = (__v4hi)__a; 709 return (unsigned short)__b[__n & 3]; 710 } 711 712 static __inline__ __m64 __DEFAULT_FN_ATTRS 713 _mm_insert_pi16(__m64 __a, int __d, int __n) 714 { 715 __v4hi __b = (__v4hi)__a; 716 __b[__n & 3] = __d; 717 return (__m64)__b; 718 } 719 720 static __inline__ __m64 __DEFAULT_FN_ATTRS 721 _mm_max_pi16(__m64 __a, __m64 __b) 722 { 723 return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); 724 } 725 726 static __inline__ __m64 __DEFAULT_FN_ATTRS 727 _mm_max_pu8(__m64 __a, __m64 __b) 728 { 729 return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b); 730 } 731 732 static __inline__ __m64 __DEFAULT_FN_ATTRS 733 _mm_min_pi16(__m64 __a, __m64 __b) 734 { 735 return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); 736 } 737 738 static __inline__ __m64 __DEFAULT_FN_ATTRS 739 _mm_min_pu8(__m64 __a, __m64 __b) 740 { 741 return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); 742 } 743 744 static __inline__ int __DEFAULT_FN_ATTRS 745 _mm_movemask_pi8(__m64 __a) 746 { 747 return __builtin_ia32_pmovmskb((__v8qi)__a); 748 } 749 750 static __inline__ __m64 __DEFAULT_FN_ATTRS 751 _mm_mulhi_pu16(__m64 __a, __m64 __b) 752 { 753 return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); 754 } 755 756 #define _mm_shuffle_pi16(a, n) __extension__ ({ \ 757 (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); }) 758 759 static __inline__ void __DEFAULT_FN_ATTRS 760 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) 761 { 762 __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); 763 } 764 765 static __inline__ __m64 __DEFAULT_FN_ATTRS 766 _mm_avg_pu8(__m64 __a, __m64 __b) 767 { 768 return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); 769 } 770 771 static __inline__ __m64 __DEFAULT_FN_ATTRS 772 _mm_avg_pu16(__m64 __a, __m64 __b) 773 { 774 return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); 775 } 776 777 static __inline__ __m64 __DEFAULT_FN_ATTRS 778 _mm_sad_pu8(__m64 __a, __m64 __b) 779 { 780 return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); 781 } 782 783 static __inline__ unsigned int __DEFAULT_FN_ATTRS 784 _mm_getcsr(void) 785 { 786 return __builtin_ia32_stmxcsr(); 787 } 788 789 static __inline__ void __DEFAULT_FN_ATTRS 790 _mm_setcsr(unsigned int __i) 791 { 792 __builtin_ia32_ldmxcsr(__i); 793 } 794 795 #define _mm_shuffle_ps(a, b, mask) __extension__ ({ \ 796 (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \ 797 (mask) & 0x3, ((mask) & 0xc) >> 2, \ 798 (((mask) & 0x30) >> 4) + 4, \ 799 (((mask) & 0xc0) >> 6) + 4); }) 800 801 static __inline__ __m128 __DEFAULT_FN_ATTRS 802 _mm_unpackhi_ps(__m128 __a, __m128 __b) 803 { 804 return __builtin_shufflevector(__a, __b, 2, 6, 3, 7); 805 } 806 807 static __inline__ __m128 __DEFAULT_FN_ATTRS 808 _mm_unpacklo_ps(__m128 __a, __m128 __b) 809 { 810 return __builtin_shufflevector(__a, __b, 0, 4, 1, 5); 811 } 812 813 static __inline__ __m128 __DEFAULT_FN_ATTRS 814 _mm_move_ss(__m128 __a, __m128 __b) 815 { 816 return __builtin_shufflevector(__a, __b, 4, 1, 2, 3); 817 } 818 819 static __inline__ __m128 __DEFAULT_FN_ATTRS 820 _mm_movehl_ps(__m128 __a, __m128 __b) 821 { 822 return __builtin_shufflevector(__a, __b, 6, 7, 2, 3); 823 } 824 825 static __inline__ __m128 __DEFAULT_FN_ATTRS 826 _mm_movelh_ps(__m128 __a, __m128 __b) 827 { 828 return __builtin_shufflevector(__a, __b, 0, 1, 4, 5); 829 } 830 831 static __inline__ __m128 __DEFAULT_FN_ATTRS 832 _mm_cvtpi16_ps(__m64 __a) 833 { 834 __m64 __b, __c; 835 __m128 __r; 836 837 __b = _mm_setzero_si64(); 838 __b = _mm_cmpgt_pi16(__b, __a); 839 __c = _mm_unpackhi_pi16(__a, __b); 840 __r = _mm_setzero_ps(); 841 __r = _mm_cvtpi32_ps(__r, __c); 842 __r = _mm_movelh_ps(__r, __r); 843 __c = _mm_unpacklo_pi16(__a, __b); 844 __r = _mm_cvtpi32_ps(__r, __c); 845 846 return __r; 847 } 848 849 static __inline__ __m128 __DEFAULT_FN_ATTRS 850 _mm_cvtpu16_ps(__m64 __a) 851 { 852 __m64 __b, __c; 853 __m128 __r; 854 855 __b = _mm_setzero_si64(); 856 __c = _mm_unpackhi_pi16(__a, __b); 857 __r = _mm_setzero_ps(); 858 __r = _mm_cvtpi32_ps(__r, __c); 859 __r = _mm_movelh_ps(__r, __r); 860 __c = _mm_unpacklo_pi16(__a, __b); 861 __r = _mm_cvtpi32_ps(__r, __c); 862 863 return __r; 864 } 865 866 static __inline__ __m128 __DEFAULT_FN_ATTRS 867 _mm_cvtpi8_ps(__m64 __a) 868 { 869 __m64 __b; 870 871 __b = _mm_setzero_si64(); 872 __b = _mm_cmpgt_pi8(__b, __a); 873 __b = _mm_unpacklo_pi8(__a, __b); 874 875 return _mm_cvtpi16_ps(__b); 876 } 877 878 static __inline__ __m128 __DEFAULT_FN_ATTRS 879 _mm_cvtpu8_ps(__m64 __a) 880 { 881 __m64 __b; 882 883 __b = _mm_setzero_si64(); 884 __b = _mm_unpacklo_pi8(__a, __b); 885 886 return _mm_cvtpi16_ps(__b); 887 } 888 889 static __inline__ __m128 __DEFAULT_FN_ATTRS 890 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b) 891 { 892 __m128 __c; 893 894 __c = _mm_setzero_ps(); 895 __c = _mm_cvtpi32_ps(__c, __b); 896 __c = _mm_movelh_ps(__c, __c); 897 898 return _mm_cvtpi32_ps(__c, __a); 899 } 900 901 static __inline__ __m64 __DEFAULT_FN_ATTRS 902 _mm_cvtps_pi16(__m128 __a) 903 { 904 __m64 __b, __c; 905 906 __b = _mm_cvtps_pi32(__a); 907 __a = _mm_movehl_ps(__a, __a); 908 __c = _mm_cvtps_pi32(__a); 909 910 return _mm_packs_pi32(__b, __c); 911 } 912 913 static __inline__ __m64 __DEFAULT_FN_ATTRS 914 _mm_cvtps_pi8(__m128 __a) 915 { 916 __m64 __b, __c; 917 918 __b = _mm_cvtps_pi16(__a); 919 __c = _mm_setzero_si64(); 920 921 return _mm_packs_pi16(__b, __c); 922 } 923 924 static __inline__ int __DEFAULT_FN_ATTRS 925 _mm_movemask_ps(__m128 __a) 926 { 927 return __builtin_ia32_movmskps(__a); 928 } 929 930 931 #ifdef _MSC_VER 932 #define _MM_ALIGN16 __declspec(align(16)) 933 #endif 934 935 #define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 936 937 #define _MM_EXCEPT_INVALID (0x0001) 938 #define _MM_EXCEPT_DENORM (0x0002) 939 #define _MM_EXCEPT_DIV_ZERO (0x0004) 940 #define _MM_EXCEPT_OVERFLOW (0x0008) 941 #define _MM_EXCEPT_UNDERFLOW (0x0010) 942 #define _MM_EXCEPT_INEXACT (0x0020) 943 #define _MM_EXCEPT_MASK (0x003f) 944 945 #define _MM_MASK_INVALID (0x0080) 946 #define _MM_MASK_DENORM (0x0100) 947 #define _MM_MASK_DIV_ZERO (0x0200) 948 #define _MM_MASK_OVERFLOW (0x0400) 949 #define _MM_MASK_UNDERFLOW (0x0800) 950 #define _MM_MASK_INEXACT (0x1000) 951 #define _MM_MASK_MASK (0x1f80) 952 953 #define _MM_ROUND_NEAREST (0x0000) 954 #define _MM_ROUND_DOWN (0x2000) 955 #define _MM_ROUND_UP (0x4000) 956 #define _MM_ROUND_TOWARD_ZERO (0x6000) 957 #define _MM_ROUND_MASK (0x6000) 958 959 #define _MM_FLUSH_ZERO_MASK (0x8000) 960 #define _MM_FLUSH_ZERO_ON (0x8000) 961 #define _MM_FLUSH_ZERO_OFF (0x0000) 962 963 #define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 964 #define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 965 #define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 966 #define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 967 968 #define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 969 #define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 970 #define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 971 #define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 972 973 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 974 do { \ 975 __m128 tmp3, tmp2, tmp1, tmp0; \ 976 tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 977 tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 978 tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 979 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 980 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 981 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 982 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 983 (row3) = _mm_movehl_ps(tmp3, tmp1); \ 984 } while (0) 985 986 /* Aliases for compatibility. */ 987 #define _m_pextrw _mm_extract_pi16 988 #define _m_pinsrw _mm_insert_pi16 989 #define _m_pmaxsw _mm_max_pi16 990 #define _m_pmaxub _mm_max_pu8 991 #define _m_pminsw _mm_min_pi16 992 #define _m_pminub _mm_min_pu8 993 #define _m_pmovmskb _mm_movemask_pi8 994 #define _m_pmulhuw _mm_mulhi_pu16 995 #define _m_pshufw _mm_shuffle_pi16 996 #define _m_maskmovq _mm_maskmove_si64 997 #define _m_pavgb _mm_avg_pu8 998 #define _m_pavgw _mm_avg_pu16 999 #define _m_psadbw _mm_sad_pu8 1000 #define _m_ _mm_ 1001 #define _m_ _mm_ 1002 1003 #undef __DEFAULT_FN_ATTRS 1004 1005 /* Ugly hack for backwards-compatibility (compatible with gcc) */ 1006 #if defined(__SSE2__) && !__has_feature(modules) 1007 #include <emmintrin.h> 1008 #endif 1009 1010 #endif /* __XMMINTRIN_H */ 1011