/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

#ifndef __SSE__
#error "SSE instruction set not enabled"
#else

#include <mmintrin.h>

typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));

/* This header should only be included in a hosted environment as it depends on
 * a standard library to provide allocation routines. */
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 __a, __m128 __b)
{
  __a[0] += __b[0];
  return __a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 __a, __m128 __b)
{
  return __a + __b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 __a, __m128 __b)
{
  __a[0] -= __b[0];
  return __a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 __a, __m128 __b)
{
  return __a - __b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 __a, __m128 __b)
{
  __a[0] *= __b[0];
  return __a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 __a, __m128 __b)
{
  return __a * __b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 __a, __m128 __b)
{
  __a[0] /= __b[0];
  return __a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 __a, __m128 __b)
{
  return __a / __b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 __a)
{
  __m128 __c = __builtin_ia32_sqrtss(__a);
  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 __a)
{
  return __builtin_ia32_sqrtps(__a);
}
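
/* A minimal usage sketch (illustrative only; __scale_by_two is a
 * hypothetical helper, not part of this header): the _ss forms operate on
 * the lowest element and pass the upper three lanes of the first operand
 * through unchanged, while the _ps forms operate on all four lanes.
 *
 *   static __m128 __scale_by_two(__m128 __v)
 *   {
 *     return _mm_mul_ps(__v, _mm_set1_ps(2.0f));  // doubles all four lanes
 *   }
 */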

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 __a)
{
  __m128 __c = __builtin_ia32_rcpss(__a);
  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 __a)
{
  return __builtin_ia32_rcpps(__a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 __a)
{
  __m128 __c = __builtin_ia32_rsqrtss(__a);
  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 __a)
{
  return __builtin_ia32_rsqrtps(__a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_minss(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_minps(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_maxss(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_maxps(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 __a, __m128 __b)
{
  return (__m128)((__v4si)__a & (__v4si)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 __a, __m128 __b)
{
  return (__m128)(~(__v4si)__a & (__v4si)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 __a, __m128 __b)
{
  return (__m128)((__v4si)__a | (__v4si)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 __a, __m128 __b)
{
  return (__m128)((__v4si)__a ^ (__v4si)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpss(__a, __b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__a, __b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpss(__a, __b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__a, __b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpss(__a, __b, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__a, __b, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_shufflevector(__a,
                                         __builtin_ia32_cmpss(__b, __a, 1),
                                         4, 1, 2, 3);
}
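
/* The packed comparisons yield a per-lane bit mask (all ones for true, all
 * zeros for false) rather than a boolean, so they compose with the logical
 * operations above into branchless selects. A minimal sketch (__select_lt
 * is a hypothetical helper):
 *
 *   // __r[i] = __a[i] < __b[i] ? __a[i] : __b[i], without branching
 *   static __m128 __select_lt(__m128 __a, __m128 __b)
 *   {
 *     __m128 __m = _mm_cmplt_ps(__a, __b);
 *     return _mm_or_ps(_mm_and_ps(__m, __a), _mm_andnot_ps(__m, __b));
 *   }
 */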

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__b, __a, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_shufflevector(__a,
                                         __builtin_ia32_cmpss(__b, __a, 2),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__b, __a, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpss(__a, __b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__a, __b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpss(__a, __b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__a, __b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpss(__a, __b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__a, __b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_shufflevector(__a,
                                         __builtin_ia32_cmpss(__b, __a, 5),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__b, __a, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_shufflevector(__a,
                                         __builtin_ia32_cmpss(__b, __a, 6),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__b, __a, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpss(__a, __b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__a, __b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpss(__a, __b, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpps(__a, __b, 3);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comieq(__a, __b);
}
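
/* Unlike the _mm_cmp* family above, which produces per-lane masks, the
 * comi/ucomi intrinsics compare only the low elements and return an
 * ordinary int, so they can be used directly in control flow; the ucomi
 * forms differ in not signaling an invalid-operation exception on quiet
 * NaNs. Sketch:
 *
 *   if (_mm_comilt_ss(__x, __y)) { ... }  // true when __x[0] < __y[0]
 */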

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comilt(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comile(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comigt(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comige(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comineq(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomieq(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomilt(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomile(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomigt(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomige(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomineq(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 __a)
{
  return __builtin_ia32_cvtss2si(__a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ss2si(__m128 __a)
{
  return _mm_cvtss_si32(__a);
}

#ifdef __x86_64__

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 __a)
{
  return __builtin_ia32_cvtss2si64(__a);
}

#endif

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 __a)
{
  return (__m64)__builtin_ia32_cvtps2pi(__a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ps2pi(__m128 __a)
{
  return _mm_cvtps_pi32(__a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 __a)
{
  return __a[0];
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ss2si(__m128 __a)
{
  return _mm_cvttss_si32(__a);
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 __a)
{
  return __a[0];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 __a)
{
  return (__m64)__builtin_ia32_cvttps2pi(__a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ps2pi(__m128 __a)
{
  return _mm_cvttps_pi32(__a);
}
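
/* Note the two float-to-int flavours above: the cvt forms round according
 * to the current MXCSR rounding mode (round-to-nearest-even by default),
 * while the cvtt forms always truncate toward zero. For example:
 *
 *   int __r = _mm_cvtss_si32(_mm_set_ss(2.7f));   // 3 (rounded)
 *   int __t = _mm_cvttss_si32(_mm_set_ss(2.7f));  // 2 (truncated)
 */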

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 __a, int __b)
{
  __a[0] = __b;
  return __a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_si2ss(__m128 __a, int __b)
{
  return _mm_cvtsi32_ss(__a, __b);
}

#ifdef __x86_64__

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 __a, long long __b)
{
  __a[0] = __b;
  return __a;
}

#endif

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 __a, __m64 __b)
{
  return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_pi2ps(__m128 __a, __m64 __b)
{
  return _mm_cvtpi32_ps(__a, __b);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 __a)
{
  return __a[0];
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 __a, const __m64 *__p)
{
  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadh_pi_struct {
    __mm_loadh_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 __a, const __m64 *__p)
{
  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadl_pi_struct {
    __mm_loadl_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ss(const float *__p)
{
  struct __mm_load_ss_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  float __u = ((struct __mm_load_ss_struct*)__p)->__u;
  return (__m128){ __u, 0, 0, 0 };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load1_ps(const float *__p)
{
  struct __mm_load1_ps_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
  return (__m128){ __u, __u, __u, __u };
}

#define _mm_load_ps1(p) _mm_load1_ps(p)

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(const float *__p)
{
  return *(__m128*)__p;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(const float *__p)
{
  struct __loadu_ps {
    __m128 __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_ps*)__p)->__v;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(const float *__p)
{
  __m128 __a = _mm_load_ps(__p);
  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float __w)
{
  return (__m128){ __w, 0, 0, 0 };
}
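
/* Alignment caveat for the loads above: _mm_load_ps dereferences an __m128
 * directly and therefore requires a 16-byte-aligned pointer, while
 * _mm_loadu_ps tolerates any alignment at some performance cost. Sketch:
 *
 *   float __buf[4] __attribute__((__aligned__(16))) = { 0.f, 1.f, 2.f, 3.f };
 *   __m128 __v = _mm_load_ps(__buf);   // requires 16-byte alignment
 *   __m128 __u = _mm_loadu_ps(__buf);  // works for any alignment
 */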

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set1_ps(float __w)
{
  return (__m128){ __w, __w, __w, __w };
}

/* Microsoft specific. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps1(float __w)
{
  return _mm_set1_ps(__w);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps(float __z, float __y, float __x, float __w)
{
  return (__m128){ __w, __x, __y, __z };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setr_ps(float __z, float __y, float __x, float __w)
{
  return (__m128){ __z, __y, __x, __w };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setzero_ps(void)
{
  return (__m128){ 0, 0, 0, 0 };
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeh_pi(__m64 *__p, __m128 __a)
{
  __builtin_ia32_storehps((__v2si *)__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_pi(__m64 *__p, __m128 __a)
{
  __builtin_ia32_storelps((__v2si *)__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ss(float *__p, __m128 __a)
{
  struct __mm_store_ss_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_ps(float *__p, __m128 __a)
{
  __builtin_ia32_storeups(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_ps(float *__p, __m128 __a)
{
  __a = __builtin_shufflevector(__a, __a, 0, 0, 0, 0);
  _mm_storeu_ps(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps1(float *__p, __m128 __a)
{
  return _mm_store1_ps(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps(float *__p, __m128 __a)
{
  *(__m128 *)__p = __a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_ps(float *__p, __m128 __a)
{
  __a = __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
  _mm_store_ps(__p, __a);
}
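
/* Argument-order caveat for the set functions above: _mm_set_ps lists
 * elements from the highest lane down to the lowest, whereas _mm_setr_ps
 * lists them in memory order, so after a store the setr arguments appear in
 * the array exactly as written. Sketch:
 *
 *   float __out[4] __attribute__((__aligned__(16)));
 *   _mm_store_ps(__out, _mm_set_ps(3.f, 2.f, 1.f, 0.f));
 *   // __out is now { 0.f, 1.f, 2.f, 3.f }
 */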

#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_NTA 0

#ifndef _MSC_VER
/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
#endif

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pi(__m64 *__p, __m64 __a)
{
  __builtin_ia32_movntq(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_ps(float *__p, __m128 __a)
{
  __builtin_ia32_movntps(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  __builtin_ia32_sfence();
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_extract_pi16(__m64 __a, int __n)
{
  __v4hi __b = (__v4hi)__a;
  return (unsigned short)__b[__n & 3];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_insert_pi16(__m64 __a, int __d, int __n)
{
  __v4hi __b = (__v4hi)__a;
  __b[__n & 3] = __d;
  return (__m64)__b;
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pi16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pi16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pi8(__m64 __a)
{
  return __builtin_ia32_pmovmskb((__v8qi)__a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_pu16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
}

#define _mm_shuffle_pi16(a, n) __extension__ ({ \
  __m64 __a = (a); \
  (__m64)__builtin_ia32_pshufw((__v4hi)__a, (n)); })

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
{
  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sad_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  return __builtin_ia32_stmxcsr();
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int __i)
{
  __builtin_ia32_ldmxcsr(__i);
}
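
/* _mm_getcsr and _mm_setcsr read and write the entire MXCSR register, so
 * the usual pattern saves the old value, adjusts one field (here via the
 * _MM_SET_ROUNDING_MODE helper defined further below), and restores it:
 *
 *   unsigned int __saved = _mm_getcsr();
 *   _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
 *   ...  // code that relies on truncating conversions
 *   _mm_setcsr(__saved);
 */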

#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__b, \
                                  (mask) & 0x3, ((mask) & 0xc) >> 2, \
                                  (((mask) & 0x30) >> 4) + 4, \
                                  (((mask) & 0xc0) >> 6) + 4); })

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 __a, __m128 __b)
{
  return __builtin_shufflevector(__a, __b, 2, 6, 3, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 __a, __m128 __b)
{
  return __builtin_shufflevector(__a, __b, 0, 4, 1, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 __a, __m128 __b)
{
  return __builtin_shufflevector(__a, __b, 4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 __a, __m128 __b)
{
  return __builtin_shufflevector(__a, __b, 6, 7, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 __a, __m128 __b)
{
  return __builtin_shufflevector(__a, __b, 0, 1, 4, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi16_ps(__m64 __a)
{
  __m64 __b, __c;
  __m128 __r;

  __b = _mm_setzero_si64();
  __b = _mm_cmpgt_pi16(__b, __a);
  __c = _mm_unpackhi_pi16(__a, __b);
  __r = _mm_setzero_ps();
  __r = _mm_cvtpi32_ps(__r, __c);
  __r = _mm_movelh_ps(__r, __r);
  __c = _mm_unpacklo_pi16(__a, __b);
  __r = _mm_cvtpi32_ps(__r, __c);

  return __r;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu16_ps(__m64 __a)
{
  __m64 __b, __c;
  __m128 __r;

  __b = _mm_setzero_si64();
  __c = _mm_unpackhi_pi16(__a, __b);
  __r = _mm_setzero_ps();
  __r = _mm_cvtpi32_ps(__r, __c);
  __r = _mm_movelh_ps(__r, __r);
  __c = _mm_unpacklo_pi16(__a, __b);
  __r = _mm_cvtpi32_ps(__r, __c);

  return __r;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi8_ps(__m64 __a)
{
  __m64 __b;

  __b = _mm_setzero_si64();
  __b = _mm_cmpgt_pi8(__b, __a);
  __b = _mm_unpacklo_pi8(__a, __b);

  return _mm_cvtpi16_ps(__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu8_ps(__m64 __a)
{
  __m64 __b;

  __b = _mm_setzero_si64();
  __b = _mm_unpacklo_pi8(__a, __b);

  return _mm_cvtpi16_ps(__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
{
  __m128 __c;

  __c = _mm_setzero_ps();
  __c = _mm_cvtpi32_ps(__c, __b);
  __c = _mm_movelh_ps(__c, __c);

  return _mm_cvtpi32_ps(__c, __a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi16(__m128 __a)
{
  __m64 __b, __c;

  __b = _mm_cvtps_pi32(__a);
  __a = _mm_movehl_ps(__a, __a);
  __c = _mm_cvtps_pi32(__a);

  return _mm_packs_pi32(__b, __c);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi8(__m128 __a)
{
  __m64 __b, __c;

  __b = _mm_cvtps_pi16(__a);
  __c = _mm_setzero_si64();

  return _mm_packs_pi16(__b, __c);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 __a)
{
  return __builtin_ia32_movmskps(__a);
}
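
/* _mm_movemask_ps packs the sign bit of each lane into the low four bits of
 * an int, which turns the all-ones/all-zeros masks produced by the packed
 * comparisons into a directly testable value. Sketch:
 *
 *   __m128 __m = _mm_cmplt_ps(__a, __b);
 *   if (_mm_movemask_ps(__m) == 0xf) { ... }  // every __a[i] < __b[i]
 */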

/* Builds the 8-bit immediate for _mm_shuffle_ps and _mm_shuffle_pi16:
   selects element w into lane 0, x into lane 1, y into lane 2, z into
   lane 3. */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)

#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)

#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))

/* Transposes, in place, the 4x4 matrix held in row0..row3. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)

/* Aliases for compatibility. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_

/* Ugly hack for backwards-compatibility (compatible with gcc) */
#ifdef __SSE2__
#include <emmintrin.h>
#endif

#endif /* __SSE__ */

#endif /* __XMMINTRIN_H */