/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

#ifndef __SSE__
#error "SSE instruction set not enabled"
#else

#include <mmintrin.h>

typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));

// This header should only be included in a hosted environment as it depends on
// a standard library to provide allocation routines.
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 a, __m128 b)
{
  a[0] += b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 a, __m128 b)
{
  return a + b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 a, __m128 b)
{
  a[0] -= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 a, __m128 b)
{
  return a - b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 a, __m128 b)
{
  a[0] *= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 a, __m128 b)
{
  return a * b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 a, __m128 b)
{
  a[0] /= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 a, __m128 b)
{
  return a / b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 a)
{
  return __builtin_ia32_sqrtss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 a)
{
  return __builtin_ia32_sqrtps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 a)
{
  return __builtin_ia32_rcpss(a);
}
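/* Note: the _ss ("scalar single") forms here operate only on element 0 and
 * pass the upper three elements of their first operand through unchanged,
 * while the _ps ("packed single") forms operate on all four elements. The
 * _mm_rcp_* and _mm_rsqrt_* intrinsics are hardware approximations (relative
 * error on the order of 1.5 * 2^-12). A common caller-side idiom, sketched
 * here only as an illustration, is one Newton-Raphson step to sharpen the
 * reciprocal estimate:
 *
 *   __m128 r = _mm_rcp_ps(x);                          // ~12-bit estimate
 *   r = _mm_sub_ps(_mm_add_ps(r, r),                   // r' = 2r - x*r*r
 *                  _mm_mul_ps(x, _mm_mul_ps(r, r)));
 */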
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 a)
{
  return __builtin_ia32_rcpps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 a)
{
  return __builtin_ia32_rsqrtss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 a)
{
  return __builtin_ia32_rsqrtps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_minss(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_minps(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_maxss(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_maxps(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a & (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 a, __m128 b)
{
  return (__m128)(~(__v4si)a & (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a | (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a ^ (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 2);
}
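/* The comparisons above and below map onto cmpss/cmpps, whose predicate
 * immediate encodes: 0=eq, 1=lt, 2=le, 3=unord, 4=neq, 5=nlt, 6=nle, 7=ord.
 * There is no gt/ge encoding, which is why _mm_cmpgt_* and _mm_cmpge_* swap
 * their operands and use lt/le instead. Each result element is all-ones when
 * the predicate holds and all-zeros otherwise, so the masks compose with the
 * logical ops above for branchless selection; an illustrative sketch:
 *
 *   // c = (a < b) ? x : y, element by element
 *   __m128 m = _mm_cmplt_ps(a, b);
 *   __m128 c = _mm_or_ps(_mm_and_ps(m, x), _mm_andnot_ps(m, y));
 */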
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 3);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comineq(a, b);
}
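/* The _mm_comi*_ss family above compares the low elements with comiss,
 * which raises the invalid-operation exception on quiet as well as
 * signaling NaNs; the _mm_ucomi*_ss family below uses ucomiss, which
 * signals only on SNaNs. Both return a 0/1 int rather than a lane mask,
 * e.g. (illustrative use):
 *
 *   if (_mm_comilt_ss(a, b)) { ... }   // branch on a[0] < b[0]
 */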
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomineq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 a)
{
  return __builtin_ia32_cvtss2si(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ss2si(__m128 a)
{
  return _mm_cvtss_si32(a);
}

#ifdef __x86_64__

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 a)
{
  return __builtin_ia32_cvtss2si64(a);
}

#endif

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvtps2pi(a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ps2pi(__m128 a)
{
  return _mm_cvtps_pi32(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 a)
{
  return a[0];
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ss2si(__m128 a)
{
  return _mm_cvttss_si32(a);
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 a)
{
  return a[0];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvttps2pi(a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ps2pi(__m128 a)
{
  return _mm_cvttps_pi32(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 a, int b)
{
  a[0] = b;
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_si2ss(__m128 a, int b)
{
  return _mm_cvtsi32_ss(a, b);
}

#ifdef __x86_64__

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 a, long long b)
{
  a[0] = b;
  return a;
}

#endif

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 a, __m64 b)
{
  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_pi2ps(__m128 a, __m64 b)
{
  return _mm_cvtpi32_ps(a, b);
}
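/* Conversion note: the cvt* forms round according to the current MXCSR
 * rounding mode (round-to-nearest-even by default), while the cvtt* forms
 * truncate toward zero, matching a C float-to-int cast. Assuming the
 * default rounding mode:
 *
 *   _mm_cvtss_si32(_mm_set_ss(1.5f));    // 2 (nearest-even)
 *   _mm_cvttss_si32(_mm_set_ss(1.5f));   // 1 (truncated)
 */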
static __inline__ float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 a)
{
  return a[0];
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 a, const __m64 *p)
{
  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadh_pi_struct {
    __mm_loadh_pi_v2f32 u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadh_pi_v2f32 b = ((struct __mm_loadh_pi_struct*)p)->u;
  __m128 bb = __builtin_shufflevector(b, b, 0, 1, 0, 1);
  return __builtin_shufflevector(a, bb, 0, 1, 4, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 a, const __m64 *p)
{
  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadl_pi_struct {
    __mm_loadl_pi_v2f32 u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadl_pi_v2f32 b = ((struct __mm_loadl_pi_struct*)p)->u;
  __m128 bb = __builtin_shufflevector(b, b, 0, 1, 0, 1);
  return __builtin_shufflevector(a, bb, 4, 5, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ss(const float *p)
{
  struct __mm_load_ss_struct {
    float u;
  } __attribute__((__packed__, __may_alias__));
  float u = ((struct __mm_load_ss_struct*)p)->u;
  return (__m128){ u, 0, 0, 0 };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load1_ps(const float *p)
{
  struct __mm_load1_ps_struct {
    float u;
  } __attribute__((__packed__, __may_alias__));
  float u = ((struct __mm_load1_ps_struct*)p)->u;
  return (__m128){ u, u, u, u };
}

#define _mm_load_ps1(p) _mm_load1_ps(p)

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(const float *p)
{
  return *(__m128*)p;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(const float *p)
{
  struct __loadu_ps {
    __m128 v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_ps*)p)->v;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(const float *p)
{
  __m128 a = _mm_load_ps(p);
  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float w)
{
  return (__m128){ w, 0, 0, 0 };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set1_ps(float w)
{
  return (__m128){ w, w, w, w };
}
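/* _mm_load_ps and _mm_store_ps (and the r-suffixed variants) require p to
 * be 16-byte aligned; _mm_loadu_ps does not, which is why it reads through
 * a __packed__, __may_alias__ struct above instead of dereferencing a
 * vector pointer directly. A usage sketch, with buf a hypothetical
 * caller-side array:
 *
 *   float buf[8] __attribute__((aligned(16))) = { 1, 2, 3, 4, 5, 6, 7, 8 };
 *   __m128 v = _mm_load_ps(buf);       // ok: 16-byte aligned
 *   __m128 u = _mm_loadu_ps(buf + 1);  // misaligned; _mm_load_ps may fault
 */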
// Microsoft specific.
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps1(float w)
{
  return _mm_set1_ps(w);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps(float z, float y, float x, float w)
{
  return (__m128){ w, x, y, z };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setr_ps(float z, float y, float x, float w)
{
  return (__m128){ z, y, x, w };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setzero_ps(void)
{
  return (__m128){ 0, 0, 0, 0 };
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeh_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storehps((__v2si *)p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storelps((__v2si *)p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ss(float *p, __m128 a)
{
  struct __mm_store_ss_struct {
    float u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_ss_struct*)p)->u = a[0];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_ps(float *p, __m128 a)
{
  __builtin_ia32_storeups(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
  _mm_storeu_ps(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps1(float *p, __m128 a)
{
  return _mm_store1_ps(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps(float *p, __m128 a)
{
  *(__m128 *)p = a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
  _mm_store_ps(p, a);
}

#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_NTA 0

/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
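/* Illustrative use of the prefetch hints above; the address may be any
 * object pointer, but sel must be a compile-time constant:
 *
 *   _mm_prefetch((char *)&table[i + 16], _MM_HINT_T0);  // into all levels
 *   _mm_prefetch((char *)p, _MM_HINT_NTA);              // non-temporal
 */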
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pi(__m64 *p, __m64 a)
{
  __builtin_ia32_movntq(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_ps(float *p, __m128 a)
{
  __builtin_ia32_movntps(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  __builtin_ia32_sfence();
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_extract_pi16(__m64 a, int n)
{
  __v4hi b = (__v4hi)a;
  return (unsigned short)b[n & 3];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_insert_pi16(__m64 a, int d, int n)
{
  __v4hi b = (__v4hi)a;
  b[n & 3] = d;
  return (__m64)b;
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pi8(__m64 a)
{
  return __builtin_ia32_pmovmskb((__v8qi)a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);
}

#define _mm_shuffle_pi16(a, n) __extension__ ({ \
  __m64 __a = (a); \
  (__m64)__builtin_ia32_pshufw((__v4hi)__a, (n)); })

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmove_si64(__m64 d, __m64 n, char *p)
{
  __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sad_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  return __builtin_ia32_stmxcsr();
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int i)
{
  __builtin_ia32_ldmxcsr(i);
}

#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__b, \
                                  (mask) & 0x3, ((mask) & 0xc) >> 2, \
                                  (((mask) & 0x30) >> 4) + 4, \
                                  (((mask) & 0xc0) >> 6) + 4); })
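/* _mm_shuffle_ps takes its two low result elements from __a and its two
 * high ones from __b, indexed by successive 2-bit fields of mask; the
 * _MM_SHUFFLE(z, y, x, w) helper defined below builds such masks. For
 * example:
 *
 *   // r = { a[0], a[2], b[1], b[3] }
 *   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 2, 0));
 */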
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 6, 7, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi16(b, a);
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi8(b, a);
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
  __m128 c;

  c = _mm_setzero_ps();
  c = _mm_cvtpi32_ps(c, b);
  c = _mm_movelh_ps(c, c);

  return _mm_cvtpi32_ps(c, a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi16(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi32(a);
  a = _mm_movehl_ps(a, a);
  c = _mm_cvtps_pi32(a);

  return _mm_packs_pi32(b, c);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi8(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi16(a);
  c = _mm_setzero_si64();

  return _mm_packs_pi16(b, c);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 a)
{
  return __builtin_ia32_movmskps(a);
}

#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

#define _MM_EXCEPT_INVALID (0x0001)
#define _MM_EXCEPT_DENORM (0x0002)
#define _MM_EXCEPT_DIV_ZERO (0x0004)
#define _MM_EXCEPT_OVERFLOW (0x0008)
#define _MM_EXCEPT_UNDERFLOW (0x0010)
#define _MM_EXCEPT_INEXACT (0x0020)
#define _MM_EXCEPT_MASK (0x003f)
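/* The _MM_EXCEPT_* values above are the MXCSR exception status flags (bits
 * 0-5); the _MM_MASK_* values below are the corresponding exception mask
 * bits (bits 7-12), followed by the rounding-control field (bits 13-14)
 * and the flush-to-zero bit (bit 15). For example, to make subsequent
 * cvt* conversions truncate, use the accessor defined below:
 *
 *   _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
 */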
#define _MM_MASK_INVALID (0x0080)
#define _MM_MASK_DENORM (0x0100)
#define _MM_MASK_DIV_ZERO (0x0200)
#define _MM_MASK_OVERFLOW (0x0400)
#define _MM_MASK_UNDERFLOW (0x0800)
#define _MM_MASK_INEXACT (0x1000)
#define _MM_MASK_MASK (0x1f80)

#define _MM_ROUND_NEAREST (0x0000)
#define _MM_ROUND_DOWN (0x2000)
#define _MM_ROUND_UP (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK (0x6000)

#define _MM_FLUSH_ZERO_MASK (0x8000)
#define _MM_FLUSH_ZERO_ON (0x8000)
#define _MM_FLUSH_ZERO_OFF (0x0000)

#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))

/* Transposes, in place, the 4x4 matrix whose rows are row0..row3. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)

/* Aliases for compatibility. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_

/* Ugly hack for backwards-compatibility (compatible with gcc) */
#ifdef __SSE2__
#include <emmintrin.h>
#endif

#endif /* __SSE__ */

#endif /* __XMMINTRIN_H */