1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24 #ifndef __EMMINTRIN_H 25 #define __EMMINTRIN_H 26 27 #include <xmmintrin.h> 28 29 typedef double __m128d __attribute__((__vector_size__(16))); 30 typedef long long __m128i __attribute__((__vector_size__(16))); 31 32 /* Type defines. */ 33 typedef double __v2df __attribute__ ((__vector_size__ (16))); 34 typedef long long __v2di __attribute__ ((__vector_size__ (16))); 35 typedef short __v8hi __attribute__((__vector_size__(16))); 36 typedef char __v16qi __attribute__((__vector_size__(16))); 37 38 /* We need an explicitly signed variant for char. Note that this shouldn't 39 * appear in the interface though. */ 40 typedef signed char __v16qs __attribute__((__vector_size__(16))); 41 42 #include <f16cintrin.h> 43 44 /* Define the default attributes for the functions in this file. */ 45 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 46 47 static __inline__ __m128d __DEFAULT_FN_ATTRS 48 _mm_add_sd(__m128d __a, __m128d __b) 49 { 50 __a[0] += __b[0]; 51 return __a; 52 } 53 54 static __inline__ __m128d __DEFAULT_FN_ATTRS 55 _mm_add_pd(__m128d __a, __m128d __b) 56 { 57 return __a + __b; 58 } 59 60 static __inline__ __m128d __DEFAULT_FN_ATTRS 61 _mm_sub_sd(__m128d __a, __m128d __b) 62 { 63 __a[0] -= __b[0]; 64 return __a; 65 } 66 67 static __inline__ __m128d __DEFAULT_FN_ATTRS 68 _mm_sub_pd(__m128d __a, __m128d __b) 69 { 70 return __a - __b; 71 } 72 73 static __inline__ __m128d __DEFAULT_FN_ATTRS 74 _mm_mul_sd(__m128d __a, __m128d __b) 75 { 76 __a[0] *= __b[0]; 77 return __a; 78 } 79 80 static __inline__ __m128d __DEFAULT_FN_ATTRS 81 _mm_mul_pd(__m128d __a, __m128d __b) 82 { 83 return __a * __b; 84 } 85 86 static __inline__ __m128d __DEFAULT_FN_ATTRS 87 _mm_div_sd(__m128d __a, __m128d __b) 88 { 89 __a[0] /= __b[0]; 90 return __a; 91 } 92 93 static __inline__ __m128d __DEFAULT_FN_ATTRS 94 _mm_div_pd(__m128d __a, __m128d __b) 95 { 96 return __a / __b; 97 } 98 99 static __inline__ __m128d __DEFAULT_FN_ATTRS 100 _mm_sqrt_sd(__m128d __a, __m128d __b) 101 { 102 __m128d __c = __builtin_ia32_sqrtsd(__b); 103 return (__m128d) { __c[0], __a[1] }; 104 } 105 106 static __inline__ __m128d __DEFAULT_FN_ATTRS 107 _mm_sqrt_pd(__m128d __a) 108 { 109 return __builtin_ia32_sqrtpd(__a); 110 } 111 112 static __inline__ __m128d __DEFAULT_FN_ATTRS 113 _mm_min_sd(__m128d __a, __m128d __b) 114 { 115 return __builtin_ia32_minsd(__a, __b); 116 } 117 118 static __inline__ __m128d __DEFAULT_FN_ATTRS 119 _mm_min_pd(__m128d __a, __m128d __b) 120 { 121 return __builtin_ia32_minpd(__a, __b); 122 } 123 124 static __inline__ __m128d __DEFAULT_FN_ATTRS 125 _mm_max_sd(__m128d __a, __m128d __b) 126 { 127 return __builtin_ia32_maxsd(__a, __b); 128 } 129 130 static __inline__ __m128d __DEFAULT_FN_ATTRS 131 _mm_max_pd(__m128d __a, __m128d __b) 132 { 133 return __builtin_ia32_maxpd(__a, __b); 134 } 135 136 static __inline__ __m128d __DEFAULT_FN_ATTRS 137 _mm_and_pd(__m128d __a, __m128d __b) 138 { 139 return (__m128d)((__v4si)__a & (__v4si)__b); 140 } 141 142 static __inline__ __m128d __DEFAULT_FN_ATTRS 143 _mm_andnot_pd(__m128d __a, __m128d __b) 144 { 145 return (__m128d)(~(__v4si)__a & (__v4si)__b); 146 } 147 148 static __inline__ __m128d __DEFAULT_FN_ATTRS 149 _mm_or_pd(__m128d __a, __m128d __b) 150 { 151 return (__m128d)((__v4si)__a | (__v4si)__b); 152 } 153 154 static __inline__ __m128d __DEFAULT_FN_ATTRS 155 _mm_xor_pd(__m128d __a, __m128d __b) 156 { 157 return (__m128d)((__v4si)__a ^ (__v4si)__b); 158 } 159 160 static __inline__ __m128d __DEFAULT_FN_ATTRS 161 _mm_cmpeq_pd(__m128d __a, __m128d __b) 162 { 163 return (__m128d)__builtin_ia32_cmpeqpd(__a, __b); 164 } 165 166 static __inline__ __m128d __DEFAULT_FN_ATTRS 167 _mm_cmplt_pd(__m128d __a, __m128d __b) 168 { 169 return (__m128d)__builtin_ia32_cmpltpd(__a, __b); 170 } 171 172 static __inline__ __m128d __DEFAULT_FN_ATTRS 173 _mm_cmple_pd(__m128d __a, __m128d __b) 174 { 175 return (__m128d)__builtin_ia32_cmplepd(__a, __b); 176 } 177 178 static __inline__ __m128d __DEFAULT_FN_ATTRS 179 _mm_cmpgt_pd(__m128d __a, __m128d __b) 180 { 181 return (__m128d)__builtin_ia32_cmpltpd(__b, __a); 182 } 183 184 static __inline__ __m128d __DEFAULT_FN_ATTRS 185 _mm_cmpge_pd(__m128d __a, __m128d __b) 186 { 187 return (__m128d)__builtin_ia32_cmplepd(__b, __a); 188 } 189 190 static __inline__ __m128d __DEFAULT_FN_ATTRS 191 _mm_cmpord_pd(__m128d __a, __m128d __b) 192 { 193 return (__m128d)__builtin_ia32_cmpordpd(__a, __b); 194 } 195 196 static __inline__ __m128d __DEFAULT_FN_ATTRS 197 _mm_cmpunord_pd(__m128d __a, __m128d __b) 198 { 199 return (__m128d)__builtin_ia32_cmpunordpd(__a, __b); 200 } 201 202 static __inline__ __m128d __DEFAULT_FN_ATTRS 203 _mm_cmpneq_pd(__m128d __a, __m128d __b) 204 { 205 return (__m128d)__builtin_ia32_cmpneqpd(__a, __b); 206 } 207 208 static __inline__ __m128d __DEFAULT_FN_ATTRS 209 _mm_cmpnlt_pd(__m128d __a, __m128d __b) 210 { 211 return (__m128d)__builtin_ia32_cmpnltpd(__a, __b); 212 } 213 214 static __inline__ __m128d __DEFAULT_FN_ATTRS 215 _mm_cmpnle_pd(__m128d __a, __m128d __b) 216 { 217 return (__m128d)__builtin_ia32_cmpnlepd(__a, __b); 218 } 219 220 static __inline__ __m128d __DEFAULT_FN_ATTRS 221 _mm_cmpngt_pd(__m128d __a, __m128d __b) 222 { 223 return (__m128d)__builtin_ia32_cmpnltpd(__b, __a); 224 } 225 226 static __inline__ __m128d __DEFAULT_FN_ATTRS 227 _mm_cmpnge_pd(__m128d __a, __m128d __b) 228 { 229 return (__m128d)__builtin_ia32_cmpnlepd(__b, __a); 230 } 231 232 static __inline__ __m128d __DEFAULT_FN_ATTRS 233 _mm_cmpeq_sd(__m128d __a, __m128d __b) 234 { 235 return (__m128d)__builtin_ia32_cmpeqsd(__a, __b); 236 } 237 238 static __inline__ __m128d __DEFAULT_FN_ATTRS 239 _mm_cmplt_sd(__m128d __a, __m128d __b) 240 { 241 return (__m128d)__builtin_ia32_cmpltsd(__a, __b); 242 } 243 244 static __inline__ __m128d __DEFAULT_FN_ATTRS 245 _mm_cmple_sd(__m128d __a, __m128d __b) 246 { 247 return (__m128d)__builtin_ia32_cmplesd(__a, __b); 248 } 249 250 static __inline__ __m128d __DEFAULT_FN_ATTRS 251 _mm_cmpgt_sd(__m128d __a, __m128d __b) 252 { 253 __m128d __c = __builtin_ia32_cmpltsd(__b, __a); 254 return (__m128d) { __c[0], __a[1] }; 255 } 256 257 static __inline__ __m128d __DEFAULT_FN_ATTRS 258 _mm_cmpge_sd(__m128d __a, __m128d __b) 259 { 260 __m128d __c = __builtin_ia32_cmplesd(__b, __a); 261 return (__m128d) { __c[0], __a[1] }; 262 } 263 264 static __inline__ __m128d __DEFAULT_FN_ATTRS 265 _mm_cmpord_sd(__m128d __a, __m128d __b) 266 { 267 return (__m128d)__builtin_ia32_cmpordsd(__a, __b); 268 } 269 270 static __inline__ __m128d __DEFAULT_FN_ATTRS 271 _mm_cmpunord_sd(__m128d __a, __m128d __b) 272 { 273 return (__m128d)__builtin_ia32_cmpunordsd(__a, __b); 274 } 275 276 static __inline__ __m128d __DEFAULT_FN_ATTRS 277 _mm_cmpneq_sd(__m128d __a, __m128d __b) 278 { 279 return (__m128d)__builtin_ia32_cmpneqsd(__a, __b); 280 } 281 282 static __inline__ __m128d __DEFAULT_FN_ATTRS 283 _mm_cmpnlt_sd(__m128d __a, __m128d __b) 284 { 285 return (__m128d)__builtin_ia32_cmpnltsd(__a, __b); 286 } 287 288 static __inline__ __m128d __DEFAULT_FN_ATTRS 289 _mm_cmpnle_sd(__m128d __a, __m128d __b) 290 { 291 return (__m128d)__builtin_ia32_cmpnlesd(__a, __b); 292 } 293 294 static __inline__ __m128d __DEFAULT_FN_ATTRS 295 _mm_cmpngt_sd(__m128d __a, __m128d __b) 296 { 297 __m128d __c = __builtin_ia32_cmpnltsd(__b, __a); 298 return (__m128d) { __c[0], __a[1] }; 299 } 300 301 static __inline__ __m128d __DEFAULT_FN_ATTRS 302 _mm_cmpnge_sd(__m128d __a, __m128d __b) 303 { 304 __m128d __c = __builtin_ia32_cmpnlesd(__b, __a); 305 return (__m128d) { __c[0], __a[1] }; 306 } 307 308 static __inline__ int __DEFAULT_FN_ATTRS 309 _mm_comieq_sd(__m128d __a, __m128d __b) 310 { 311 return __builtin_ia32_comisdeq(__a, __b); 312 } 313 314 static __inline__ int __DEFAULT_FN_ATTRS 315 _mm_comilt_sd(__m128d __a, __m128d __b) 316 { 317 return __builtin_ia32_comisdlt(__a, __b); 318 } 319 320 static __inline__ int __DEFAULT_FN_ATTRS 321 _mm_comile_sd(__m128d __a, __m128d __b) 322 { 323 return __builtin_ia32_comisdle(__a, __b); 324 } 325 326 static __inline__ int __DEFAULT_FN_ATTRS 327 _mm_comigt_sd(__m128d __a, __m128d __b) 328 { 329 return __builtin_ia32_comisdgt(__a, __b); 330 } 331 332 static __inline__ int __DEFAULT_FN_ATTRS 333 _mm_comige_sd(__m128d __a, __m128d __b) 334 { 335 return __builtin_ia32_comisdge(__a, __b); 336 } 337 338 static __inline__ int __DEFAULT_FN_ATTRS 339 _mm_comineq_sd(__m128d __a, __m128d __b) 340 { 341 return __builtin_ia32_comisdneq(__a, __b); 342 } 343 344 static __inline__ int __DEFAULT_FN_ATTRS 345 _mm_ucomieq_sd(__m128d __a, __m128d __b) 346 { 347 return __builtin_ia32_ucomisdeq(__a, __b); 348 } 349 350 static __inline__ int __DEFAULT_FN_ATTRS 351 _mm_ucomilt_sd(__m128d __a, __m128d __b) 352 { 353 return __builtin_ia32_ucomisdlt(__a, __b); 354 } 355 356 static __inline__ int __DEFAULT_FN_ATTRS 357 _mm_ucomile_sd(__m128d __a, __m128d __b) 358 { 359 return __builtin_ia32_ucomisdle(__a, __b); 360 } 361 362 static __inline__ int __DEFAULT_FN_ATTRS 363 _mm_ucomigt_sd(__m128d __a, __m128d __b) 364 { 365 return __builtin_ia32_ucomisdgt(__a, __b); 366 } 367 368 static __inline__ int __DEFAULT_FN_ATTRS 369 _mm_ucomige_sd(__m128d __a, __m128d __b) 370 { 371 return __builtin_ia32_ucomisdge(__a, __b); 372 } 373 374 static __inline__ int __DEFAULT_FN_ATTRS 375 _mm_ucomineq_sd(__m128d __a, __m128d __b) 376 { 377 return __builtin_ia32_ucomisdneq(__a, __b); 378 } 379 380 static __inline__ __m128 __DEFAULT_FN_ATTRS 381 _mm_cvtpd_ps(__m128d __a) 382 { 383 return __builtin_ia32_cvtpd2ps(__a); 384 } 385 386 static __inline__ __m128d __DEFAULT_FN_ATTRS 387 _mm_cvtps_pd(__m128 __a) 388 { 389 return __builtin_ia32_cvtps2pd(__a); 390 } 391 392 static __inline__ __m128d __DEFAULT_FN_ATTRS 393 _mm_cvtepi32_pd(__m128i __a) 394 { 395 return __builtin_ia32_cvtdq2pd((__v4si)__a); 396 } 397 398 static __inline__ __m128i __DEFAULT_FN_ATTRS 399 _mm_cvtpd_epi32(__m128d __a) 400 { 401 return __builtin_ia32_cvtpd2dq(__a); 402 } 403 404 static __inline__ int __DEFAULT_FN_ATTRS 405 _mm_cvtsd_si32(__m128d __a) 406 { 407 return __builtin_ia32_cvtsd2si(__a); 408 } 409 410 static __inline__ __m128 __DEFAULT_FN_ATTRS 411 _mm_cvtsd_ss(__m128 __a, __m128d __b) 412 { 413 __a[0] = __b[0]; 414 return __a; 415 } 416 417 static __inline__ __m128d __DEFAULT_FN_ATTRS 418 _mm_cvtsi32_sd(__m128d __a, int __b) 419 { 420 __a[0] = __b; 421 return __a; 422 } 423 424 static __inline__ __m128d __DEFAULT_FN_ATTRS 425 _mm_cvtss_sd(__m128d __a, __m128 __b) 426 { 427 __a[0] = __b[0]; 428 return __a; 429 } 430 431 static __inline__ __m128i __DEFAULT_FN_ATTRS 432 _mm_cvttpd_epi32(__m128d __a) 433 { 434 return (__m128i)__builtin_ia32_cvttpd2dq(__a); 435 } 436 437 static __inline__ int __DEFAULT_FN_ATTRS 438 _mm_cvttsd_si32(__m128d __a) 439 { 440 return __a[0]; 441 } 442 443 static __inline__ __m64 __DEFAULT_FN_ATTRS 444 _mm_cvtpd_pi32(__m128d __a) 445 { 446 return (__m64)__builtin_ia32_cvtpd2pi(__a); 447 } 448 449 static __inline__ __m64 __DEFAULT_FN_ATTRS 450 _mm_cvttpd_pi32(__m128d __a) 451 { 452 return (__m64)__builtin_ia32_cvttpd2pi(__a); 453 } 454 455 static __inline__ __m128d __DEFAULT_FN_ATTRS 456 _mm_cvtpi32_pd(__m64 __a) 457 { 458 return __builtin_ia32_cvtpi2pd((__v2si)__a); 459 } 460 461 static __inline__ double __DEFAULT_FN_ATTRS 462 _mm_cvtsd_f64(__m128d __a) 463 { 464 return __a[0]; 465 } 466 467 static __inline__ __m128d __DEFAULT_FN_ATTRS 468 _mm_load_pd(double const *__dp) 469 { 470 return *(__m128d*)__dp; 471 } 472 473 static __inline__ __m128d __DEFAULT_FN_ATTRS 474 _mm_load1_pd(double const *__dp) 475 { 476 struct __mm_load1_pd_struct { 477 double __u; 478 } __attribute__((__packed__, __may_alias__)); 479 double __u = ((struct __mm_load1_pd_struct*)__dp)->__u; 480 return (__m128d){ __u, __u }; 481 } 482 483 #define _mm_load_pd1(dp) _mm_load1_pd(dp) 484 485 static __inline__ __m128d __DEFAULT_FN_ATTRS 486 _mm_loadr_pd(double const *__dp) 487 { 488 __m128d __u = *(__m128d*)__dp; 489 return __builtin_shufflevector(__u, __u, 1, 0); 490 } 491 492 static __inline__ __m128d __DEFAULT_FN_ATTRS 493 _mm_loadu_pd(double const *__dp) 494 { 495 struct __loadu_pd { 496 __m128d __v; 497 } __attribute__((__packed__, __may_alias__)); 498 return ((struct __loadu_pd*)__dp)->__v; 499 } 500 501 static __inline__ __m128d __DEFAULT_FN_ATTRS 502 _mm_load_sd(double const *__dp) 503 { 504 struct __mm_load_sd_struct { 505 double __u; 506 } __attribute__((__packed__, __may_alias__)); 507 double __u = ((struct __mm_load_sd_struct*)__dp)->__u; 508 return (__m128d){ __u, 0 }; 509 } 510 511 static __inline__ __m128d __DEFAULT_FN_ATTRS 512 _mm_loadh_pd(__m128d __a, double const *__dp) 513 { 514 struct __mm_loadh_pd_struct { 515 double __u; 516 } __attribute__((__packed__, __may_alias__)); 517 double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u; 518 return (__m128d){ __a[0], __u }; 519 } 520 521 static __inline__ __m128d __DEFAULT_FN_ATTRS 522 _mm_loadl_pd(__m128d __a, double const *__dp) 523 { 524 struct __mm_loadl_pd_struct { 525 double __u; 526 } __attribute__((__packed__, __may_alias__)); 527 double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u; 528 return (__m128d){ __u, __a[1] }; 529 } 530 531 static __inline__ __m128d __DEFAULT_FN_ATTRS 532 _mm_undefined_pd() 533 { 534 return (__m128d)__builtin_ia32_undef128(); 535 } 536 537 static __inline__ __m128d __DEFAULT_FN_ATTRS 538 _mm_set_sd(double __w) 539 { 540 return (__m128d){ __w, 0 }; 541 } 542 543 static __inline__ __m128d __DEFAULT_FN_ATTRS 544 _mm_set1_pd(double __w) 545 { 546 return (__m128d){ __w, __w }; 547 } 548 549 static __inline__ __m128d __DEFAULT_FN_ATTRS 550 _mm_set_pd(double __w, double __x) 551 { 552 return (__m128d){ __x, __w }; 553 } 554 555 static __inline__ __m128d __DEFAULT_FN_ATTRS 556 _mm_setr_pd(double __w, double __x) 557 { 558 return (__m128d){ __w, __x }; 559 } 560 561 static __inline__ __m128d __DEFAULT_FN_ATTRS 562 _mm_setzero_pd(void) 563 { 564 return (__m128d){ 0, 0 }; 565 } 566 567 static __inline__ __m128d __DEFAULT_FN_ATTRS 568 _mm_move_sd(__m128d __a, __m128d __b) 569 { 570 return (__m128d){ __b[0], __a[1] }; 571 } 572 573 static __inline__ void __DEFAULT_FN_ATTRS 574 _mm_store_sd(double *__dp, __m128d __a) 575 { 576 struct __mm_store_sd_struct { 577 double __u; 578 } __attribute__((__packed__, __may_alias__)); 579 ((struct __mm_store_sd_struct*)__dp)->__u = __a[0]; 580 } 581 582 static __inline__ void __DEFAULT_FN_ATTRS 583 _mm_store1_pd(double *__dp, __m128d __a) 584 { 585 struct __mm_store1_pd_struct { 586 double __u[2]; 587 } __attribute__((__packed__, __may_alias__)); 588 ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0]; 589 ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0]; 590 } 591 592 static __inline__ void __DEFAULT_FN_ATTRS 593 _mm_store_pd(double *__dp, __m128d __a) 594 { 595 *(__m128d *)__dp = __a; 596 } 597 598 static __inline__ void __DEFAULT_FN_ATTRS 599 _mm_storeu_pd(double *__dp, __m128d __a) 600 { 601 __builtin_ia32_storeupd(__dp, __a); 602 } 603 604 static __inline__ void __DEFAULT_FN_ATTRS 605 _mm_storer_pd(double *__dp, __m128d __a) 606 { 607 __a = __builtin_shufflevector(__a, __a, 1, 0); 608 *(__m128d *)__dp = __a; 609 } 610 611 static __inline__ void __DEFAULT_FN_ATTRS 612 _mm_storeh_pd(double *__dp, __m128d __a) 613 { 614 struct __mm_storeh_pd_struct { 615 double __u; 616 } __attribute__((__packed__, __may_alias__)); 617 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1]; 618 } 619 620 static __inline__ void __DEFAULT_FN_ATTRS 621 _mm_storel_pd(double *__dp, __m128d __a) 622 { 623 struct __mm_storeh_pd_struct { 624 double __u; 625 } __attribute__((__packed__, __may_alias__)); 626 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0]; 627 } 628 629 static __inline__ __m128i __DEFAULT_FN_ATTRS 630 _mm_add_epi8(__m128i __a, __m128i __b) 631 { 632 return (__m128i)((__v16qi)__a + (__v16qi)__b); 633 } 634 635 static __inline__ __m128i __DEFAULT_FN_ATTRS 636 _mm_add_epi16(__m128i __a, __m128i __b) 637 { 638 return (__m128i)((__v8hi)__a + (__v8hi)__b); 639 } 640 641 static __inline__ __m128i __DEFAULT_FN_ATTRS 642 _mm_add_epi32(__m128i __a, __m128i __b) 643 { 644 return (__m128i)((__v4si)__a + (__v4si)__b); 645 } 646 647 static __inline__ __m64 __DEFAULT_FN_ATTRS 648 _mm_add_si64(__m64 __a, __m64 __b) 649 { 650 return (__m64)__builtin_ia32_paddq(__a, __b); 651 } 652 653 static __inline__ __m128i __DEFAULT_FN_ATTRS 654 _mm_add_epi64(__m128i __a, __m128i __b) 655 { 656 return __a + __b; 657 } 658 659 static __inline__ __m128i __DEFAULT_FN_ATTRS 660 _mm_adds_epi8(__m128i __a, __m128i __b) 661 { 662 return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b); 663 } 664 665 static __inline__ __m128i __DEFAULT_FN_ATTRS 666 _mm_adds_epi16(__m128i __a, __m128i __b) 667 { 668 return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b); 669 } 670 671 static __inline__ __m128i __DEFAULT_FN_ATTRS 672 _mm_adds_epu8(__m128i __a, __m128i __b) 673 { 674 return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b); 675 } 676 677 static __inline__ __m128i __DEFAULT_FN_ATTRS 678 _mm_adds_epu16(__m128i __a, __m128i __b) 679 { 680 return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b); 681 } 682 683 static __inline__ __m128i __DEFAULT_FN_ATTRS 684 _mm_avg_epu8(__m128i __a, __m128i __b) 685 { 686 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); 687 } 688 689 static __inline__ __m128i __DEFAULT_FN_ATTRS 690 _mm_avg_epu16(__m128i __a, __m128i __b) 691 { 692 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); 693 } 694 695 static __inline__ __m128i __DEFAULT_FN_ATTRS 696 _mm_madd_epi16(__m128i __a, __m128i __b) 697 { 698 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); 699 } 700 701 static __inline__ __m128i __DEFAULT_FN_ATTRS 702 _mm_max_epi16(__m128i __a, __m128i __b) 703 { 704 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b); 705 } 706 707 static __inline__ __m128i __DEFAULT_FN_ATTRS 708 _mm_max_epu8(__m128i __a, __m128i __b) 709 { 710 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b); 711 } 712 713 static __inline__ __m128i __DEFAULT_FN_ATTRS 714 _mm_min_epi16(__m128i __a, __m128i __b) 715 { 716 return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b); 717 } 718 719 static __inline__ __m128i __DEFAULT_FN_ATTRS 720 _mm_min_epu8(__m128i __a, __m128i __b) 721 { 722 return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b); 723 } 724 725 static __inline__ __m128i __DEFAULT_FN_ATTRS 726 _mm_mulhi_epi16(__m128i __a, __m128i __b) 727 { 728 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); 729 } 730 731 static __inline__ __m128i __DEFAULT_FN_ATTRS 732 _mm_mulhi_epu16(__m128i __a, __m128i __b) 733 { 734 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); 735 } 736 737 static __inline__ __m128i __DEFAULT_FN_ATTRS 738 _mm_mullo_epi16(__m128i __a, __m128i __b) 739 { 740 return (__m128i)((__v8hi)__a * (__v8hi)__b); 741 } 742 743 static __inline__ __m64 __DEFAULT_FN_ATTRS 744 _mm_mul_su32(__m64 __a, __m64 __b) 745 { 746 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); 747 } 748 749 static __inline__ __m128i __DEFAULT_FN_ATTRS 750 _mm_mul_epu32(__m128i __a, __m128i __b) 751 { 752 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); 753 } 754 755 static __inline__ __m128i __DEFAULT_FN_ATTRS 756 _mm_sad_epu8(__m128i __a, __m128i __b) 757 { 758 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); 759 } 760 761 static __inline__ __m128i __DEFAULT_FN_ATTRS 762 _mm_sub_epi8(__m128i __a, __m128i __b) 763 { 764 return (__m128i)((__v16qi)__a - (__v16qi)__b); 765 } 766 767 static __inline__ __m128i __DEFAULT_FN_ATTRS 768 _mm_sub_epi16(__m128i __a, __m128i __b) 769 { 770 return (__m128i)((__v8hi)__a - (__v8hi)__b); 771 } 772 773 static __inline__ __m128i __DEFAULT_FN_ATTRS 774 _mm_sub_epi32(__m128i __a, __m128i __b) 775 { 776 return (__m128i)((__v4si)__a - (__v4si)__b); 777 } 778 779 static __inline__ __m64 __DEFAULT_FN_ATTRS 780 _mm_sub_si64(__m64 __a, __m64 __b) 781 { 782 return (__m64)__builtin_ia32_psubq(__a, __b); 783 } 784 785 static __inline__ __m128i __DEFAULT_FN_ATTRS 786 _mm_sub_epi64(__m128i __a, __m128i __b) 787 { 788 return __a - __b; 789 } 790 791 static __inline__ __m128i __DEFAULT_FN_ATTRS 792 _mm_subs_epi8(__m128i __a, __m128i __b) 793 { 794 return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b); 795 } 796 797 static __inline__ __m128i __DEFAULT_FN_ATTRS 798 _mm_subs_epi16(__m128i __a, __m128i __b) 799 { 800 return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b); 801 } 802 803 static __inline__ __m128i __DEFAULT_FN_ATTRS 804 _mm_subs_epu8(__m128i __a, __m128i __b) 805 { 806 return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b); 807 } 808 809 static __inline__ __m128i __DEFAULT_FN_ATTRS 810 _mm_subs_epu16(__m128i __a, __m128i __b) 811 { 812 return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b); 813 } 814 815 static __inline__ __m128i __DEFAULT_FN_ATTRS 816 _mm_and_si128(__m128i __a, __m128i __b) 817 { 818 return __a & __b; 819 } 820 821 static __inline__ __m128i __DEFAULT_FN_ATTRS 822 _mm_andnot_si128(__m128i __a, __m128i __b) 823 { 824 return ~__a & __b; 825 } 826 827 static __inline__ __m128i __DEFAULT_FN_ATTRS 828 _mm_or_si128(__m128i __a, __m128i __b) 829 { 830 return __a | __b; 831 } 832 833 static __inline__ __m128i __DEFAULT_FN_ATTRS 834 _mm_xor_si128(__m128i __a, __m128i __b) 835 { 836 return __a ^ __b; 837 } 838 839 #define _mm_slli_si128(a, imm) __extension__ ({ \ 840 (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(), \ 841 (__v16qi)(__m128i)(a), \ 842 ((imm)&0xF0) ? 0 : 16 - ((imm)&0xF), \ 843 ((imm)&0xF0) ? 0 : 17 - ((imm)&0xF), \ 844 ((imm)&0xF0) ? 0 : 18 - ((imm)&0xF), \ 845 ((imm)&0xF0) ? 0 : 19 - ((imm)&0xF), \ 846 ((imm)&0xF0) ? 0 : 20 - ((imm)&0xF), \ 847 ((imm)&0xF0) ? 0 : 21 - ((imm)&0xF), \ 848 ((imm)&0xF0) ? 0 : 22 - ((imm)&0xF), \ 849 ((imm)&0xF0) ? 0 : 23 - ((imm)&0xF), \ 850 ((imm)&0xF0) ? 0 : 24 - ((imm)&0xF), \ 851 ((imm)&0xF0) ? 0 : 25 - ((imm)&0xF), \ 852 ((imm)&0xF0) ? 0 : 26 - ((imm)&0xF), \ 853 ((imm)&0xF0) ? 0 : 27 - ((imm)&0xF), \ 854 ((imm)&0xF0) ? 0 : 28 - ((imm)&0xF), \ 855 ((imm)&0xF0) ? 0 : 29 - ((imm)&0xF), \ 856 ((imm)&0xF0) ? 0 : 30 - ((imm)&0xF), \ 857 ((imm)&0xF0) ? 0 : 31 - ((imm)&0xF)); }) 858 859 #define _mm_bslli_si128(a, imm) \ 860 _mm_slli_si128((a), (imm)) 861 862 static __inline__ __m128i __DEFAULT_FN_ATTRS 863 _mm_slli_epi16(__m128i __a, int __count) 864 { 865 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); 866 } 867 868 static __inline__ __m128i __DEFAULT_FN_ATTRS 869 _mm_sll_epi16(__m128i __a, __m128i __count) 870 { 871 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); 872 } 873 874 static __inline__ __m128i __DEFAULT_FN_ATTRS 875 _mm_slli_epi32(__m128i __a, int __count) 876 { 877 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); 878 } 879 880 static __inline__ __m128i __DEFAULT_FN_ATTRS 881 _mm_sll_epi32(__m128i __a, __m128i __count) 882 { 883 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); 884 } 885 886 static __inline__ __m128i __DEFAULT_FN_ATTRS 887 _mm_slli_epi64(__m128i __a, int __count) 888 { 889 return __builtin_ia32_psllqi128(__a, __count); 890 } 891 892 static __inline__ __m128i __DEFAULT_FN_ATTRS 893 _mm_sll_epi64(__m128i __a, __m128i __count) 894 { 895 return __builtin_ia32_psllq128(__a, __count); 896 } 897 898 static __inline__ __m128i __DEFAULT_FN_ATTRS 899 _mm_srai_epi16(__m128i __a, int __count) 900 { 901 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); 902 } 903 904 static __inline__ __m128i __DEFAULT_FN_ATTRS 905 _mm_sra_epi16(__m128i __a, __m128i __count) 906 { 907 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); 908 } 909 910 static __inline__ __m128i __DEFAULT_FN_ATTRS 911 _mm_srai_epi32(__m128i __a, int __count) 912 { 913 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); 914 } 915 916 static __inline__ __m128i __DEFAULT_FN_ATTRS 917 _mm_sra_epi32(__m128i __a, __m128i __count) 918 { 919 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); 920 } 921 922 #define _mm_srli_si128(a, imm) __extension__ ({ \ 923 (__m128i)__builtin_shufflevector((__v16qi)(__m128i)(a), \ 924 (__v16qi)_mm_setzero_si128(), \ 925 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 0, \ 926 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 1, \ 927 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 2, \ 928 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 3, \ 929 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 4, \ 930 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 5, \ 931 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 6, \ 932 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 7, \ 933 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 8, \ 934 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 9, \ 935 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 10, \ 936 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 11, \ 937 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 12, \ 938 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 13, \ 939 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 14, \ 940 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 15); }) 941 942 #define _mm_bsrli_si128(a, imm) \ 943 _mm_srli_si128((a), (imm)) 944 945 static __inline__ __m128i __DEFAULT_FN_ATTRS 946 _mm_srli_epi16(__m128i __a, int __count) 947 { 948 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); 949 } 950 951 static __inline__ __m128i __DEFAULT_FN_ATTRS 952 _mm_srl_epi16(__m128i __a, __m128i __count) 953 { 954 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); 955 } 956 957 static __inline__ __m128i __DEFAULT_FN_ATTRS 958 _mm_srli_epi32(__m128i __a, int __count) 959 { 960 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); 961 } 962 963 static __inline__ __m128i __DEFAULT_FN_ATTRS 964 _mm_srl_epi32(__m128i __a, __m128i __count) 965 { 966 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); 967 } 968 969 static __inline__ __m128i __DEFAULT_FN_ATTRS 970 _mm_srli_epi64(__m128i __a, int __count) 971 { 972 return __builtin_ia32_psrlqi128(__a, __count); 973 } 974 975 static __inline__ __m128i __DEFAULT_FN_ATTRS 976 _mm_srl_epi64(__m128i __a, __m128i __count) 977 { 978 return __builtin_ia32_psrlq128(__a, __count); 979 } 980 981 static __inline__ __m128i __DEFAULT_FN_ATTRS 982 _mm_cmpeq_epi8(__m128i __a, __m128i __b) 983 { 984 return (__m128i)((__v16qi)__a == (__v16qi)__b); 985 } 986 987 static __inline__ __m128i __DEFAULT_FN_ATTRS 988 _mm_cmpeq_epi16(__m128i __a, __m128i __b) 989 { 990 return (__m128i)((__v8hi)__a == (__v8hi)__b); 991 } 992 993 static __inline__ __m128i __DEFAULT_FN_ATTRS 994 _mm_cmpeq_epi32(__m128i __a, __m128i __b) 995 { 996 return (__m128i)((__v4si)__a == (__v4si)__b); 997 } 998 999 static __inline__ __m128i __DEFAULT_FN_ATTRS 1000 _mm_cmpgt_epi8(__m128i __a, __m128i __b) 1001 { 1002 /* This function always performs a signed comparison, but __v16qi is a char 1003 which may be signed or unsigned, so use __v16qs. */ 1004 return (__m128i)((__v16qs)__a > (__v16qs)__b); 1005 } 1006 1007 static __inline__ __m128i __DEFAULT_FN_ATTRS 1008 _mm_cmpgt_epi16(__m128i __a, __m128i __b) 1009 { 1010 return (__m128i)((__v8hi)__a > (__v8hi)__b); 1011 } 1012 1013 static __inline__ __m128i __DEFAULT_FN_ATTRS 1014 _mm_cmpgt_epi32(__m128i __a, __m128i __b) 1015 { 1016 return (__m128i)((__v4si)__a > (__v4si)__b); 1017 } 1018 1019 static __inline__ __m128i __DEFAULT_FN_ATTRS 1020 _mm_cmplt_epi8(__m128i __a, __m128i __b) 1021 { 1022 return _mm_cmpgt_epi8(__b, __a); 1023 } 1024 1025 static __inline__ __m128i __DEFAULT_FN_ATTRS 1026 _mm_cmplt_epi16(__m128i __a, __m128i __b) 1027 { 1028 return _mm_cmpgt_epi16(__b, __a); 1029 } 1030 1031 static __inline__ __m128i __DEFAULT_FN_ATTRS 1032 _mm_cmplt_epi32(__m128i __a, __m128i __b) 1033 { 1034 return _mm_cmpgt_epi32(__b, __a); 1035 } 1036 1037 #ifdef __x86_64__ 1038 static __inline__ __m128d __DEFAULT_FN_ATTRS 1039 _mm_cvtsi64_sd(__m128d __a, long long __b) 1040 { 1041 __a[0] = __b; 1042 return __a; 1043 } 1044 1045 static __inline__ long long __DEFAULT_FN_ATTRS 1046 _mm_cvtsd_si64(__m128d __a) 1047 { 1048 return __builtin_ia32_cvtsd2si64(__a); 1049 } 1050 1051 static __inline__ long long __DEFAULT_FN_ATTRS 1052 _mm_cvttsd_si64(__m128d __a) 1053 { 1054 return __a[0]; 1055 } 1056 #endif 1057 1058 static __inline__ __m128 __DEFAULT_FN_ATTRS 1059 _mm_cvtepi32_ps(__m128i __a) 1060 { 1061 return __builtin_ia32_cvtdq2ps((__v4si)__a); 1062 } 1063 1064 static __inline__ __m128i __DEFAULT_FN_ATTRS 1065 _mm_cvtps_epi32(__m128 __a) 1066 { 1067 return (__m128i)__builtin_ia32_cvtps2dq(__a); 1068 } 1069 1070 static __inline__ __m128i __DEFAULT_FN_ATTRS 1071 _mm_cvttps_epi32(__m128 __a) 1072 { 1073 return (__m128i)__builtin_ia32_cvttps2dq(__a); 1074 } 1075 1076 static __inline__ __m128i __DEFAULT_FN_ATTRS 1077 _mm_cvtsi32_si128(int __a) 1078 { 1079 return (__m128i)(__v4si){ __a, 0, 0, 0 }; 1080 } 1081 1082 #ifdef __x86_64__ 1083 static __inline__ __m128i __DEFAULT_FN_ATTRS 1084 _mm_cvtsi64_si128(long long __a) 1085 { 1086 return (__m128i){ __a, 0 }; 1087 } 1088 #endif 1089 1090 static __inline__ int __DEFAULT_FN_ATTRS 1091 _mm_cvtsi128_si32(__m128i __a) 1092 { 1093 __v4si __b = (__v4si)__a; 1094 return __b[0]; 1095 } 1096 1097 #ifdef __x86_64__ 1098 static __inline__ long long __DEFAULT_FN_ATTRS 1099 _mm_cvtsi128_si64(__m128i __a) 1100 { 1101 return __a[0]; 1102 } 1103 #endif 1104 1105 static __inline__ __m128i __DEFAULT_FN_ATTRS 1106 _mm_load_si128(__m128i const *__p) 1107 { 1108 return *__p; 1109 } 1110 1111 static __inline__ __m128i __DEFAULT_FN_ATTRS 1112 _mm_loadu_si128(__m128i const *__p) 1113 { 1114 struct __loadu_si128 { 1115 __m128i __v; 1116 } __attribute__((__packed__, __may_alias__)); 1117 return ((struct __loadu_si128*)__p)->__v; 1118 } 1119 1120 static __inline__ __m128i __DEFAULT_FN_ATTRS 1121 _mm_loadl_epi64(__m128i const *__p) 1122 { 1123 struct __mm_loadl_epi64_struct { 1124 long long __u; 1125 } __attribute__((__packed__, __may_alias__)); 1126 return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0}; 1127 } 1128 1129 static __inline__ __m128i __DEFAULT_FN_ATTRS 1130 _mm_undefined_si128() 1131 { 1132 return (__m128i)__builtin_ia32_undef128(); 1133 } 1134 1135 static __inline__ __m128i __DEFAULT_FN_ATTRS 1136 _mm_set_epi64x(long long __q1, long long __q0) 1137 { 1138 return (__m128i){ __q0, __q1 }; 1139 } 1140 1141 static __inline__ __m128i __DEFAULT_FN_ATTRS 1142 _mm_set_epi64(__m64 __q1, __m64 __q0) 1143 { 1144 return (__m128i){ (long long)__q0, (long long)__q1 }; 1145 } 1146 1147 static __inline__ __m128i __DEFAULT_FN_ATTRS 1148 _mm_set_epi32(int __i3, int __i2, int __i1, int __i0) 1149 { 1150 return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; 1151 } 1152 1153 static __inline__ __m128i __DEFAULT_FN_ATTRS 1154 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0) 1155 { 1156 return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; 1157 } 1158 1159 static __inline__ __m128i __DEFAULT_FN_ATTRS 1160 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) 1161 { 1162 return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; 1163 } 1164 1165 static __inline__ __m128i __DEFAULT_FN_ATTRS 1166 _mm_set1_epi64x(long long __q) 1167 { 1168 return (__m128i){ __q, __q }; 1169 } 1170 1171 static __inline__ __m128i __DEFAULT_FN_ATTRS 1172 _mm_set1_epi64(__m64 __q) 1173 { 1174 return (__m128i){ (long long)__q, (long long)__q }; 1175 } 1176 1177 static __inline__ __m128i __DEFAULT_FN_ATTRS 1178 _mm_set1_epi32(int __i) 1179 { 1180 return (__m128i)(__v4si){ __i, __i, __i, __i }; 1181 } 1182 1183 static __inline__ __m128i __DEFAULT_FN_ATTRS 1184 _mm_set1_epi16(short __w) 1185 { 1186 return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w }; 1187 } 1188 1189 static __inline__ __m128i __DEFAULT_FN_ATTRS 1190 _mm_set1_epi8(char __b) 1191 { 1192 return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b }; 1193 } 1194 1195 static __inline__ __m128i __DEFAULT_FN_ATTRS 1196 _mm_setr_epi64(__m64 __q0, __m64 __q1) 1197 { 1198 return (__m128i){ (long long)__q0, (long long)__q1 }; 1199 } 1200 1201 static __inline__ __m128i __DEFAULT_FN_ATTRS 1202 _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) 1203 { 1204 return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; 1205 } 1206 1207 static __inline__ __m128i __DEFAULT_FN_ATTRS 1208 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7) 1209 { 1210 return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; 1211 } 1212 1213 static __inline__ __m128i __DEFAULT_FN_ATTRS 1214 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15) 1215 { 1216 return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; 1217 } 1218 1219 static __inline__ __m128i __DEFAULT_FN_ATTRS 1220 _mm_setzero_si128(void) 1221 { 1222 return (__m128i){ 0LL, 0LL }; 1223 } 1224 1225 static __inline__ void __DEFAULT_FN_ATTRS 1226 _mm_store_si128(__m128i *__p, __m128i __b) 1227 { 1228 *__p = __b; 1229 } 1230 1231 static __inline__ void __DEFAULT_FN_ATTRS 1232 _mm_storeu_si128(__m128i *__p, __m128i __b) 1233 { 1234 __builtin_ia32_storedqu((char *)__p, (__v16qi)__b); 1235 } 1236 1237 static __inline__ void __DEFAULT_FN_ATTRS 1238 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) 1239 { 1240 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); 1241 } 1242 1243 static __inline__ void __DEFAULT_FN_ATTRS 1244 _mm_storel_epi64(__m128i *__p, __m128i __a) 1245 { 1246 struct __mm_storel_epi64_struct { 1247 long long __u; 1248 } __attribute__((__packed__, __may_alias__)); 1249 ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; 1250 } 1251 1252 static __inline__ void __DEFAULT_FN_ATTRS 1253 _mm_stream_pd(double *__p, __m128d __a) 1254 { 1255 __builtin_ia32_movntpd(__p, __a); 1256 } 1257 1258 static __inline__ void __DEFAULT_FN_ATTRS 1259 _mm_stream_si128(__m128i *__p, __m128i __a) 1260 { 1261 __builtin_ia32_movntdq(__p, __a); 1262 } 1263 1264 static __inline__ void __DEFAULT_FN_ATTRS 1265 _mm_stream_si32(int *__p, int __a) 1266 { 1267 __builtin_ia32_movnti(__p, __a); 1268 } 1269 1270 #ifdef __x86_64__ 1271 static __inline__ void __DEFAULT_FN_ATTRS 1272 _mm_stream_si64(long long *__p, long long __a) 1273 { 1274 __builtin_ia32_movnti64(__p, __a); 1275 } 1276 #endif 1277 1278 static __inline__ void __DEFAULT_FN_ATTRS 1279 _mm_clflush(void const *__p) 1280 { 1281 __builtin_ia32_clflush(__p); 1282 } 1283 1284 static __inline__ void __DEFAULT_FN_ATTRS 1285 _mm_lfence(void) 1286 { 1287 __builtin_ia32_lfence(); 1288 } 1289 1290 static __inline__ void __DEFAULT_FN_ATTRS 1291 _mm_mfence(void) 1292 { 1293 __builtin_ia32_mfence(); 1294 } 1295 1296 static __inline__ __m128i __DEFAULT_FN_ATTRS 1297 _mm_packs_epi16(__m128i __a, __m128i __b) 1298 { 1299 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); 1300 } 1301 1302 static __inline__ __m128i __DEFAULT_FN_ATTRS 1303 _mm_packs_epi32(__m128i __a, __m128i __b) 1304 { 1305 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); 1306 } 1307 1308 static __inline__ __m128i __DEFAULT_FN_ATTRS 1309 _mm_packus_epi16(__m128i __a, __m128i __b) 1310 { 1311 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); 1312 } 1313 1314 static __inline__ int __DEFAULT_FN_ATTRS 1315 _mm_extract_epi16(__m128i __a, int __imm) 1316 { 1317 __v8hi __b = (__v8hi)__a; 1318 return (unsigned short)__b[__imm & 7]; 1319 } 1320 1321 static __inline__ __m128i __DEFAULT_FN_ATTRS 1322 _mm_insert_epi16(__m128i __a, int __b, int __imm) 1323 { 1324 __v8hi __c = (__v8hi)__a; 1325 __c[__imm & 7] = __b; 1326 return (__m128i)__c; 1327 } 1328 1329 static __inline__ int __DEFAULT_FN_ATTRS 1330 _mm_movemask_epi8(__m128i __a) 1331 { 1332 return __builtin_ia32_pmovmskb128((__v16qi)__a); 1333 } 1334 1335 #define _mm_shuffle_epi32(a, imm) __extension__ ({ \ 1336 (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \ 1337 (__v4si)_mm_setzero_si128(), \ 1338 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 1339 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); }) 1340 1341 #define _mm_shufflelo_epi16(a, imm) __extension__ ({ \ 1342 (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ 1343 (__v8hi)_mm_setzero_si128(), \ 1344 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 1345 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ 1346 4, 5, 6, 7); }) 1347 1348 #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \ 1349 (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ 1350 (__v8hi)_mm_setzero_si128(), \ 1351 0, 1, 2, 3, \ 1352 4 + (((imm) & 0x03) >> 0), \ 1353 4 + (((imm) & 0x0c) >> 2), \ 1354 4 + (((imm) & 0x30) >> 4), \ 1355 4 + (((imm) & 0xc0) >> 6)); }) 1356 1357 static __inline__ __m128i __DEFAULT_FN_ATTRS 1358 _mm_unpackhi_epi8(__m128i __a, __m128i __b) 1359 { 1360 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 1361 } 1362 1363 static __inline__ __m128i __DEFAULT_FN_ATTRS 1364 _mm_unpackhi_epi16(__m128i __a, __m128i __b) 1365 { 1366 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 1367 } 1368 1369 static __inline__ __m128i __DEFAULT_FN_ATTRS 1370 _mm_unpackhi_epi32(__m128i __a, __m128i __b) 1371 { 1372 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); 1373 } 1374 1375 static __inline__ __m128i __DEFAULT_FN_ATTRS 1376 _mm_unpackhi_epi64(__m128i __a, __m128i __b) 1377 { 1378 return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1); 1379 } 1380 1381 static __inline__ __m128i __DEFAULT_FN_ATTRS 1382 _mm_unpacklo_epi8(__m128i __a, __m128i __b) 1383 { 1384 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 1385 } 1386 1387 static __inline__ __m128i __DEFAULT_FN_ATTRS 1388 _mm_unpacklo_epi16(__m128i __a, __m128i __b) 1389 { 1390 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 1391 } 1392 1393 static __inline__ __m128i __DEFAULT_FN_ATTRS 1394 _mm_unpacklo_epi32(__m128i __a, __m128i __b) 1395 { 1396 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); 1397 } 1398 1399 static __inline__ __m128i __DEFAULT_FN_ATTRS 1400 _mm_unpacklo_epi64(__m128i __a, __m128i __b) 1401 { 1402 return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0); 1403 } 1404 1405 static __inline__ __m64 __DEFAULT_FN_ATTRS 1406 _mm_movepi64_pi64(__m128i __a) 1407 { 1408 return (__m64)__a[0]; 1409 } 1410 1411 static __inline__ __m128i __DEFAULT_FN_ATTRS 1412 _mm_movpi64_epi64(__m64 __a) 1413 { 1414 return (__m128i){ (long long)__a, 0 }; 1415 } 1416 1417 static __inline__ __m128i __DEFAULT_FN_ATTRS 1418 _mm_move_epi64(__m128i __a) 1419 { 1420 return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2); 1421 } 1422 1423 static __inline__ __m128d __DEFAULT_FN_ATTRS 1424 _mm_unpackhi_pd(__m128d __a, __m128d __b) 1425 { 1426 return __builtin_shufflevector(__a, __b, 1, 2+1); 1427 } 1428 1429 static __inline__ __m128d __DEFAULT_FN_ATTRS 1430 _mm_unpacklo_pd(__m128d __a, __m128d __b) 1431 { 1432 return __builtin_shufflevector(__a, __b, 0, 2+0); 1433 } 1434 1435 static __inline__ int __DEFAULT_FN_ATTRS 1436 _mm_movemask_pd(__m128d __a) 1437 { 1438 return __builtin_ia32_movmskpd(__a); 1439 } 1440 1441 #define _mm_shuffle_pd(a, b, i) __extension__ ({ \ 1442 (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ 1443 (i) & 1, (((i) & 2) >> 1) + 2); }) 1444 1445 static __inline__ __m128 __DEFAULT_FN_ATTRS 1446 _mm_castpd_ps(__m128d __a) 1447 { 1448 return (__m128)__a; 1449 } 1450 1451 static __inline__ __m128i __DEFAULT_FN_ATTRS 1452 _mm_castpd_si128(__m128d __a) 1453 { 1454 return (__m128i)__a; 1455 } 1456 1457 static __inline__ __m128d __DEFAULT_FN_ATTRS 1458 _mm_castps_pd(__m128 __a) 1459 { 1460 return (__m128d)__a; 1461 } 1462 1463 static __inline__ __m128i __DEFAULT_FN_ATTRS 1464 _mm_castps_si128(__m128 __a) 1465 { 1466 return (__m128i)__a; 1467 } 1468 1469 static __inline__ __m128 __DEFAULT_FN_ATTRS 1470 _mm_castsi128_ps(__m128i __a) 1471 { 1472 return (__m128)__a; 1473 } 1474 1475 static __inline__ __m128d __DEFAULT_FN_ATTRS 1476 _mm_castsi128_pd(__m128i __a) 1477 { 1478 return (__m128d)__a; 1479 } 1480 1481 static __inline__ void __DEFAULT_FN_ATTRS 1482 _mm_pause(void) 1483 { 1484 __builtin_ia32_pause(); 1485 } 1486 1487 #undef __DEFAULT_FN_ATTRS 1488 1489 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 1490 1491 #endif /* __EMMINTRIN_H */ 1492