1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 
20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24 #ifndef __XMMINTRIN_H 25 #define __XMMINTRIN_H 26 27 #ifndef __SSE__ 28 #error "SSE instruction set not enabled" 29 #else 30 31 #include <mmintrin.h> 32 33 typedef int __v4si __attribute__((__vector_size__(16))); 34 typedef float __v4sf __attribute__((__vector_size__(16))); 35 typedef float __m128 __attribute__((__vector_size__(16))); 36 37 #include <mm_malloc.h> 38 39 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 40 _mm_add_ss(__m128 a, __m128 b) 41 { 42 a[0] += b[0]; 43 return a; 44 } 45 46 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 47 _mm_add_ps(__m128 a, __m128 b) 48 { 49 return a + b; 50 } 51 52 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 53 _mm_sub_ss(__m128 a, __m128 b) 54 { 55 a[0] -= b[0]; 56 return a; 57 } 58 59 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 60 _mm_sub_ps(__m128 a, __m128 b) 61 { 62 return a - b; 63 } 64 65 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 66 _mm_mul_ss(__m128 a, __m128 b) 67 { 68 a[0] *= b[0]; 69 return a; 70 } 71 72 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 73 _mm_mul_ps(__m128 a, __m128 b) 74 { 75 return a * b; 76 } 77 78 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 79 _mm_div_ss(__m128 a, __m128 b) 80 { 81 a[0] /= b[0]; 82 return a; 83 } 84 85 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 86 _mm_div_ps(__m128 a, __m128 b) 87 { 88 return a / b; 89 } 90 91 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 92 _mm_sqrt_ss(__m128 a) 93 { 94 return __builtin_ia32_sqrtss(a); 95 } 96 97 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 98 _mm_sqrt_ps(__m128 a) 99 { 100 return __builtin_ia32_sqrtps(a); 101 } 102 103 static __inline__ __m128 
__attribute__((__always_inline__, __nodebug__)) 104 _mm_rcp_ss(__m128 a) 105 { 106 return __builtin_ia32_rcpss(a); 107 } 108 109 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 110 _mm_rcp_ps(__m128 a) 111 { 112 return __builtin_ia32_rcpps(a); 113 } 114 115 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 116 _mm_rsqrt_ss(__m128 a) 117 { 118 return __builtin_ia32_rsqrtss(a); 119 } 120 121 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 122 _mm_rsqrt_ps(__m128 a) 123 { 124 return __builtin_ia32_rsqrtps(a); 125 } 126 127 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 128 _mm_min_ss(__m128 a, __m128 b) 129 { 130 return __builtin_ia32_minss(a, b); 131 } 132 133 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 134 _mm_min_ps(__m128 a, __m128 b) 135 { 136 return __builtin_ia32_minps(a, b); 137 } 138 139 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 140 _mm_max_ss(__m128 a, __m128 b) 141 { 142 return __builtin_ia32_maxss(a, b); 143 } 144 145 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 146 _mm_max_ps(__m128 a, __m128 b) 147 { 148 return __builtin_ia32_maxps(a, b); 149 } 150 151 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 152 _mm_and_ps(__m128 a, __m128 b) 153 { 154 return (__m128)((__v4si)a & (__v4si)b); 155 } 156 157 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 158 _mm_andnot_ps(__m128 a, __m128 b) 159 { 160 return (__m128)(~(__v4si)a & (__v4si)b); 161 } 162 163 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 164 _mm_or_ps(__m128 a, __m128 b) 165 { 166 return (__m128)((__v4si)a | (__v4si)b); 167 } 168 169 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 170 _mm_xor_ps(__m128 a, __m128 b) 171 { 172 return (__m128)((__v4si)a ^ (__v4si)b); 173 } 174 175 static __inline__ __m128 
__attribute__((__always_inline__, __nodebug__)) 176 _mm_cmpeq_ss(__m128 a, __m128 b) 177 { 178 return (__m128)__builtin_ia32_cmpss(a, b, 0); 179 } 180 181 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 182 _mm_cmpeq_ps(__m128 a, __m128 b) 183 { 184 return (__m128)__builtin_ia32_cmpps(a, b, 0); 185 } 186 187 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 188 _mm_cmplt_ss(__m128 a, __m128 b) 189 { 190 return (__m128)__builtin_ia32_cmpss(a, b, 1); 191 } 192 193 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 194 _mm_cmplt_ps(__m128 a, __m128 b) 195 { 196 return (__m128)__builtin_ia32_cmpps(a, b, 1); 197 } 198 199 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 200 _mm_cmple_ss(__m128 a, __m128 b) 201 { 202 return (__m128)__builtin_ia32_cmpss(a, b, 2); 203 } 204 205 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 206 _mm_cmple_ps(__m128 a, __m128 b) 207 { 208 return (__m128)__builtin_ia32_cmpps(a, b, 2); 209 } 210 211 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 212 _mm_cmpgt_ss(__m128 a, __m128 b) 213 { 214 return (__m128)__builtin_ia32_cmpss(b, a, 1); 215 } 216 217 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 218 _mm_cmpgt_ps(__m128 a, __m128 b) 219 { 220 return (__m128)__builtin_ia32_cmpps(b, a, 1); 221 } 222 223 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 224 _mm_cmpge_ss(__m128 a, __m128 b) 225 { 226 return (__m128)__builtin_ia32_cmpss(b, a, 2); 227 } 228 229 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 230 _mm_cmpge_ps(__m128 a, __m128 b) 231 { 232 return (__m128)__builtin_ia32_cmpps(b, a, 2); 233 } 234 235 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 236 _mm_cmpneq_ss(__m128 a, __m128 b) 237 { 238 return (__m128)__builtin_ia32_cmpss(a, b, 4); 239 } 240 241 static __inline__ __m128 
__attribute__((__always_inline__, __nodebug__)) 242 _mm_cmpneq_ps(__m128 a, __m128 b) 243 { 244 return (__m128)__builtin_ia32_cmpps(a, b, 4); 245 } 246 247 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 248 _mm_cmpnlt_ss(__m128 a, __m128 b) 249 { 250 return (__m128)__builtin_ia32_cmpss(a, b, 5); 251 } 252 253 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 254 _mm_cmpnlt_ps(__m128 a, __m128 b) 255 { 256 return (__m128)__builtin_ia32_cmpps(a, b, 5); 257 } 258 259 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 260 _mm_cmpnle_ss(__m128 a, __m128 b) 261 { 262 return (__m128)__builtin_ia32_cmpss(a, b, 6); 263 } 264 265 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 266 _mm_cmpnle_ps(__m128 a, __m128 b) 267 { 268 return (__m128)__builtin_ia32_cmpps(a, b, 6); 269 } 270 271 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 272 _mm_cmpngt_ss(__m128 a, __m128 b) 273 { 274 return (__m128)__builtin_ia32_cmpss(b, a, 5); 275 } 276 277 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 278 _mm_cmpngt_ps(__m128 a, __m128 b) 279 { 280 return (__m128)__builtin_ia32_cmpps(b, a, 5); 281 } 282 283 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 284 _mm_cmpnge_ss(__m128 a, __m128 b) 285 { 286 return (__m128)__builtin_ia32_cmpss(b, a, 6); 287 } 288 289 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 290 _mm_cmpnge_ps(__m128 a, __m128 b) 291 { 292 return (__m128)__builtin_ia32_cmpps(b, a, 6); 293 } 294 295 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 296 _mm_cmpord_ss(__m128 a, __m128 b) 297 { 298 return (__m128)__builtin_ia32_cmpss(a, b, 7); 299 } 300 301 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 302 _mm_cmpord_ps(__m128 a, __m128 b) 303 { 304 return (__m128)__builtin_ia32_cmpps(a, b, 7); 305 } 306 307 static __inline__ __m128 
__attribute__((__always_inline__, __nodebug__)) 308 _mm_cmpunord_ss(__m128 a, __m128 b) 309 { 310 return (__m128)__builtin_ia32_cmpss(a, b, 3); 311 } 312 313 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 314 _mm_cmpunord_ps(__m128 a, __m128 b) 315 { 316 return (__m128)__builtin_ia32_cmpps(a, b, 3); 317 } 318 319 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 320 _mm_comieq_ss(__m128 a, __m128 b) 321 { 322 return __builtin_ia32_comieq(a, b); 323 } 324 325 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 326 _mm_comilt_ss(__m128 a, __m128 b) 327 { 328 return __builtin_ia32_comilt(a, b); 329 } 330 331 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 332 _mm_comile_ss(__m128 a, __m128 b) 333 { 334 return __builtin_ia32_comile(a, b); 335 } 336 337 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 338 _mm_comigt_ss(__m128 a, __m128 b) 339 { 340 return __builtin_ia32_comigt(a, b); 341 } 342 343 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 344 _mm_comige_ss(__m128 a, __m128 b) 345 { 346 return __builtin_ia32_comige(a, b); 347 } 348 349 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 350 _mm_comineq_ss(__m128 a, __m128 b) 351 { 352 return __builtin_ia32_comineq(a, b); 353 } 354 355 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 356 _mm_ucomieq_ss(__m128 a, __m128 b) 357 { 358 return __builtin_ia32_ucomieq(a, b); 359 } 360 361 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 362 _mm_ucomilt_ss(__m128 a, __m128 b) 363 { 364 return __builtin_ia32_ucomilt(a, b); 365 } 366 367 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 368 _mm_ucomile_ss(__m128 a, __m128 b) 369 { 370 return __builtin_ia32_ucomile(a, b); 371 } 372 373 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 374 _mm_ucomigt_ss(__m128 a, __m128 b) 375 { 376 return 
__builtin_ia32_ucomigt(a, b); 377 } 378 379 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 380 _mm_ucomige_ss(__m128 a, __m128 b) 381 { 382 return __builtin_ia32_ucomige(a, b); 383 } 384 385 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 386 _mm_ucomineq_ss(__m128 a, __m128 b) 387 { 388 return __builtin_ia32_ucomineq(a, b); 389 } 390 391 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 392 _mm_cvtss_si32(__m128 a) 393 { 394 return __builtin_ia32_cvtss2si(a); 395 } 396 397 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 398 _mm_cvt_ss2si(__m128 a) 399 { 400 return _mm_cvtss_si32(a); 401 } 402 403 #ifdef __x86_64__ 404 405 static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 406 _mm_cvtss_si64(__m128 a) 407 { 408 return __builtin_ia32_cvtss2si64(a); 409 } 410 411 #endif 412 413 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 414 _mm_cvtps_pi32(__m128 a) 415 { 416 return (__m64)__builtin_ia32_cvtps2pi(a); 417 } 418 419 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 420 _mm_cvt_ps2pi(__m128 a) 421 { 422 return _mm_cvtps_pi32(a); 423 } 424 425 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 426 _mm_cvttss_si32(__m128 a) 427 { 428 return a[0]; 429 } 430 431 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 432 _mm_cvtt_ss2si(__m128 a) 433 { 434 return _mm_cvttss_si32(a); 435 } 436 437 static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 438 _mm_cvttss_si64(__m128 a) 439 { 440 return a[0]; 441 } 442 443 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 444 _mm_cvttps_pi32(__m128 a) 445 { 446 return (__m64)__builtin_ia32_cvttps2pi(a); 447 } 448 449 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 450 _mm_cvtt_ps2pi(__m128 a) 451 { 452 return _mm_cvttps_pi32(a); 453 } 454 455 static __inline__ __m128 
__attribute__((__always_inline__, __nodebug__)) 456 _mm_cvtsi32_ss(__m128 a, int b) 457 { 458 a[0] = b; 459 return a; 460 } 461 462 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 463 _mm_cvt_si2ss(__m128 a, int b) 464 { 465 return _mm_cvtsi32_ss(a, b); 466 } 467 468 #ifdef __x86_64__ 469 470 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 471 _mm_cvtsi64_ss(__m128 a, long long b) 472 { 473 a[0] = b; 474 return a; 475 } 476 477 #endif 478 479 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 480 _mm_cvtpi32_ps(__m128 a, __m64 b) 481 { 482 return __builtin_ia32_cvtpi2ps(a, (__v2si)b); 483 } 484 485 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 486 _mm_cvt_pi2ps(__m128 a, __m64 b) 487 { 488 return _mm_cvtpi32_ps(a, b); 489 } 490 491 static __inline__ float __attribute__((__always_inline__, __nodebug__)) 492 _mm_cvtss_f32(__m128 a) 493 { 494 return a[0]; 495 } 496 497 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 498 _mm_loadh_pi(__m128 a, const __m64 *p) 499 { 500 __m128 b; 501 b[0] = *(float*)p; 502 b[1] = *((float*)p+1); 503 return __builtin_shufflevector(a, b, 0, 1, 4, 5); 504 } 505 506 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 507 _mm_loadl_pi(__m128 a, const __m64 *p) 508 { 509 __m128 b; 510 b[0] = *(float*)p; 511 b[1] = *((float*)p+1); 512 return __builtin_shufflevector(a, b, 4, 5, 2, 3); 513 } 514 515 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 516 _mm_load_ss(const float *p) 517 { 518 return (__m128){ *p, 0, 0, 0 }; 519 } 520 521 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 522 _mm_load1_ps(const float *p) 523 { 524 return (__m128){ *p, *p, *p, *p }; 525 } 526 527 #define _mm_load_ps1(p) _mm_load1_ps(p) 528 529 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 530 _mm_load_ps(const float *p) 531 { 532 return *(__m128*)p; 533 
} 534 535 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 536 _mm_loadu_ps(const float *p) 537 { 538 return __builtin_ia32_loadups(p); 539 } 540 541 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 542 _mm_loadr_ps(const float *p) 543 { 544 __m128 a = _mm_load_ps(p); 545 return __builtin_shufflevector(a, a, 3, 2, 1, 0); 546 } 547 548 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 549 _mm_set_ss(float w) 550 { 551 return (__m128){ w, 0, 0, 0 }; 552 } 553 554 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 555 _mm_set1_ps(float w) 556 { 557 return (__m128){ w, w, w, w }; 558 } 559 560 // Microsoft specific. 561 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 562 _mm_set_ps1(float w) 563 { 564 return _mm_set1_ps(w); 565 } 566 567 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 568 _mm_set_ps(float z, float y, float x, float w) 569 { 570 return (__m128){ w, x, y, z }; 571 } 572 573 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 574 _mm_setr_ps(float z, float y, float x, float w) 575 { 576 return (__m128){ z, y, x, w }; 577 } 578 579 static __inline__ __m128 __attribute__((__always_inline__)) 580 _mm_setzero_ps(void) 581 { 582 return (__m128){ 0, 0, 0, 0 }; 583 } 584 585 static __inline__ void __attribute__((__always_inline__)) 586 _mm_storeh_pi(__m64 *p, __m128 a) 587 { 588 __builtin_ia32_storehps((__v2si *)p, a); 589 } 590 591 static __inline__ void __attribute__((__always_inline__)) 592 _mm_storel_pi(__m64 *p, __m128 a) 593 { 594 __builtin_ia32_storelps((__v2si *)p, a); 595 } 596 597 static __inline__ void __attribute__((__always_inline__)) 598 _mm_store_ss(float *p, __m128 a) 599 { 600 *p = a[0]; 601 } 602 603 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 604 _mm_storeu_ps(float *p, __m128 a) 605 { 606 __builtin_ia32_storeups(p, a); 607 } 608 609 static __inline__ void 
__attribute__((__always_inline__, __nodebug__)) 610 _mm_store1_ps(float *p, __m128 a) 611 { 612 a = __builtin_shufflevector(a, a, 0, 0, 0, 0); 613 _mm_storeu_ps(p, a); 614 } 615 616 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 617 _mm_store_ps1(float *p, __m128 a) 618 { 619 return _mm_store1_ps(p, a); 620 } 621 622 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 623 _mm_store_ps(float *p, __m128 a) 624 { 625 *(__m128 *)p = a; 626 } 627 628 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 629 _mm_storer_ps(float *p, __m128 a) 630 { 631 a = __builtin_shufflevector(a, a, 3, 2, 1, 0); 632 _mm_store_ps(p, a); 633 } 634 635 #define _MM_HINT_T0 3 636 #define _MM_HINT_T1 2 637 #define _MM_HINT_T2 1 638 #define _MM_HINT_NTA 0 639 640 /* FIXME: We have to #define this because "sel" must be a constant integer, and 641 Sema doesn't do any form of constant propagation yet. */ 642 643 #define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, sel)) 644 645 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 646 _mm_stream_pi(__m64 *p, __m64 a) 647 { 648 __builtin_ia32_movntq(p, a); 649 } 650 651 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 652 _mm_stream_ps(float *p, __m128 a) 653 { 654 __builtin_ia32_movntps(p, a); 655 } 656 657 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 658 _mm_sfence(void) 659 { 660 __builtin_ia32_sfence(); 661 } 662 663 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 664 _mm_extract_pi16(__m64 a, int n) 665 { 666 __v4hi b = (__v4hi)a; 667 return (unsigned short)b[n & 3]; 668 } 669 670 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 671 _mm_insert_pi16(__m64 a, int d, int n) 672 { 673 __v4hi b = (__v4hi)a; 674 b[n & 3] = d; 675 return (__m64)b; 676 } 677 678 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 679 _mm_max_pi16(__m64 a, __m64 b) 680 
{ 681 return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b); 682 } 683 684 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 685 _mm_max_pu8(__m64 a, __m64 b) 686 { 687 return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b); 688 } 689 690 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 691 _mm_min_pi16(__m64 a, __m64 b) 692 { 693 return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b); 694 } 695 696 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 697 _mm_min_pu8(__m64 a, __m64 b) 698 { 699 return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b); 700 } 701 702 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 703 _mm_movemask_pi8(__m64 a) 704 { 705 return __builtin_ia32_pmovmskb((__v8qi)a); 706 } 707 708 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 709 _mm_mulhi_pu16(__m64 a, __m64 b) 710 { 711 return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b); 712 } 713 714 #define _mm_shuffle_pi16(a, n) \ 715 ((__m64)__builtin_ia32_pshufw(a, n)) 716 717 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 718 _mm_maskmove_si64(__m64 d, __m64 n, char *p) 719 { 720 __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p); 721 } 722 723 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 724 _mm_avg_pu8(__m64 a, __m64 b) 725 { 726 return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b); 727 } 728 729 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 730 _mm_avg_pu16(__m64 a, __m64 b) 731 { 732 return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b); 733 } 734 735 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 736 _mm_sad_pu8(__m64 a, __m64 b) 737 { 738 return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b); 739 } 740 741 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) 742 _mm_getcsr(void) 743 { 744 return __builtin_ia32_stmxcsr(); 745 } 
/* Write the MXCSR control/status register.  */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int i)
{
  __builtin_ia32_ldmxcsr(i);
}

/* mask must be an immediate; low two index lanes select from a,
   high two select from b (SHUFPS semantics).  */
#define _mm_shuffle_ps(a, b, mask) \
        (__builtin_shufflevector((__v4sf)(a), (__v4sf)(b), \
                                 (mask) & 0x3, ((mask) & 0xc) >> 2, \
                                 (((mask) & 0x30) >> 4) + 4, \
                                 (((mask) & 0xc0) >> 6) + 4))

/* Interleave the high / low pairs of a and b.  */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
}

/* Result = { b[0], a[1], a[2], a[3] } (MOVSS register form).  */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 4, 1, 2, 3);
}

/* Result = { b[2], b[3], a[2], a[3] } (MOVHLPS).  */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 6, 7, 2, 3);
}

/* Result = { a[0], a[1], b[0], b[1] } (MOVLHPS).  */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}

/* Convert four signed 16-bit ints to four floats: sign-extend each half
   via a compare-generated sign mask, convert two at a time with
   CVTPI2PS, assembling high then low pairs.  */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi16(b, a);     /* b = sign mask of a (all-ones where a < 0) */
  c = _mm_unpackhi_pi16(a, b);  /* high two elements, sign-extended to 32 bits */
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);      /* move converted pair to the high half */
  c = _mm_unpacklo_pi16(a, b);  /* low two elements */
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

/* Same as _mm_cvtpi16_ps but unsigned: zero-extend (unpack with zero)
   instead of sign-extending.  */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

/* Convert the low four signed 8-bit ints to floats: sign-extend to
   16 bits, then reuse the pi16 path.  */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi8(b, a);     /* sign mask of a */
  b = _mm_unpacklo_pi8(a, b);  /* sign-extend low four bytes to 16 bits */

  return _mm_cvtpi16_ps(b);
}

/* Unsigned variant: zero-extend the low four bytes.  */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

/* Convert two pairs of 32-bit ints: b becomes the high two floats,
   a the low two.  */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
  __m128 c;

  c = _mm_setzero_ps();
  c = _mm_cvtpi32_ps(c, b);
  c = _mm_movelh_ps(c, c);

  return _mm_cvtpi32_ps(c, a);
}

/* Convert four floats to four signed 16-bit ints with saturation:
   two CVTPS2PI conversions, then pack.  */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi16(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi32(a);
  a = _mm_movehl_ps(a, a);  /* bring the high pair into the low half */
  c = _mm_cvtps_pi32(a);

  return _mm_packs_pi16(b, c);
}

/* Convert four floats to signed 8-bit ints (low four lanes of the
   result; high four are zero).  */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi8(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi16(a);
  c = _mm_setzero_si64();

  return _mm_packs_pi16(b, c);
}

/* Gather the sign bits of the four floats into the low 4 bits of an int
   (MOVMSKPS).  */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 a)
{
  return __builtin_ia32_movmskps(a);
}

/* Build a SHUFPS/PSHUFW immediate from four 2-bit lane selectors.  */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

/* MXCSR exception status bits.  */
#define _MM_EXCEPT_INVALID (0x0001)
#define _MM_EXCEPT_DENORM (0x0002)
#define _MM_EXCEPT_DIV_ZERO (0x0004)
#define _MM_EXCEPT_OVERFLOW (0x0008)
#define _MM_EXCEPT_UNDERFLOW (0x0010)
#define _MM_EXCEPT_INEXACT (0x0020)
#define _MM_EXCEPT_MASK (0x003f)

/* MXCSR exception mask bits.  */
#define _MM_MASK_INVALID (0x0080)
#define _MM_MASK_DENORM (0x0100)
#define _MM_MASK_DIV_ZERO (0x0200)
#define _MM_MASK_OVERFLOW (0x0400) 902 #define _MM_MASK_UNDERFLOW (0x0800) 903 #define _MM_MASK_INEXACT (0x1000) 904 #define _MM_MASK_MASK (0x1f80) 905 906 #define _MM_ROUND_NEAREST (0x0000) 907 #define _MM_ROUND_DOWN (0x2000) 908 #define _MM_ROUND_UP (0x4000) 909 #define _MM_ROUND_TOWARD_ZERO (0x6000) 910 #define _MM_ROUND_MASK (0x6000) 911 912 #define _MM_FLUSH_ZERO_MASK (0x8000) 913 #define _MM_FLUSH_ZERO_ON (0x8000) 914 #define _MM_FLUSH_ZERO_OFF (0x8000) 915 916 #define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 917 #define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 918 #define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 919 #define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 920 921 #define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 922 #define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 923 #define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 924 #define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 925 926 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 927 do { \ 928 __m128 tmp3, tmp2, tmp1, tmp0; \ 929 tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 930 tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 931 tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 932 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 933 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 934 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 935 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 936 (row3) = _mm_movehl_ps(tmp3, tmp1); \ 937 } while (0) 938 939 /* Aliases for compatibility. 
*/ 940 #define _m_pextrw _mm_extract_pi16 941 #define _m_pinsrw _mm_insert_pi16 942 #define _m_pmaxsw _mm_max_pi16 943 #define _m_pmaxub _mm_max_pu8 944 #define _m_pminsw _mm_min_pi16 945 #define _m_pminub _mm_min_pu8 946 #define _m_pmovmskb _mm_movemask_pi8 947 #define _m_pmulhuw _mm_mulhi_pu16 948 #define _m_pshufw _mm_shuffle_pi16 949 #define _m_maskmovq _mm_maskmove_si64 950 #define _m_pavgb _mm_avg_pu8 951 #define _m_pavgw _mm_avg_pu16 952 #define _m_psadbw _mm_sad_pu8 953 #define _m_ _mm_ 954 #define _m_ _mm_ 955 956 /* Ugly hack for backwards-compatibility (compatible with gcc) */ 957 #ifdef __SSE2__ 958 #include <emmintrin.h> 959 #endif 960 961 #endif /* __SSE__ */ 962 963 #endif /* __XMMINTRIN_H */ 964