1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24 #ifndef __EMMINTRIN_H 25 #define __EMMINTRIN_H 26 27 #include <xmmintrin.h> 28 29 typedef double __m128d __attribute__((__vector_size__(16))); 30 typedef long long __m128i __attribute__((__vector_size__(16))); 31 32 /* Type defines. 
*/ 33 typedef double __v2df __attribute__ ((__vector_size__ (16))); 34 typedef long long __v2di __attribute__ ((__vector_size__ (16))); 35 typedef short __v8hi __attribute__((__vector_size__(16))); 36 typedef char __v16qi __attribute__((__vector_size__(16))); 37 38 /* Unsigned types */ 39 typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16))); 40 typedef unsigned short __v8hu __attribute__((__vector_size__(16))); 41 typedef unsigned char __v16qu __attribute__((__vector_size__(16))); 42 43 /* We need an explicitly signed variant for char. Note that this shouldn't 44 * appear in the interface though. */ 45 typedef signed char __v16qs __attribute__((__vector_size__(16))); 46 47 #include <f16cintrin.h> 48 49 /* Define the default attributes for the functions in this file. */ 50 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 51 52 static __inline__ __m128d __DEFAULT_FN_ATTRS 53 _mm_add_sd(__m128d __a, __m128d __b) 54 { 55 __a[0] += __b[0]; 56 return __a; 57 } 58 59 static __inline__ __m128d __DEFAULT_FN_ATTRS 60 _mm_add_pd(__m128d __a, __m128d __b) 61 { 62 return (__m128d)((__v2df)__a + (__v2df)__b); 63 } 64 65 static __inline__ __m128d __DEFAULT_FN_ATTRS 66 _mm_sub_sd(__m128d __a, __m128d __b) 67 { 68 __a[0] -= __b[0]; 69 return __a; 70 } 71 72 static __inline__ __m128d __DEFAULT_FN_ATTRS 73 _mm_sub_pd(__m128d __a, __m128d __b) 74 { 75 return (__m128d)((__v2df)__a - (__v2df)__b); 76 } 77 78 static __inline__ __m128d __DEFAULT_FN_ATTRS 79 _mm_mul_sd(__m128d __a, __m128d __b) 80 { 81 __a[0] *= __b[0]; 82 return __a; 83 } 84 85 static __inline__ __m128d __DEFAULT_FN_ATTRS 86 _mm_mul_pd(__m128d __a, __m128d __b) 87 { 88 return (__m128d)((__v2df)__a * (__v2df)__b); 89 } 90 91 static __inline__ __m128d __DEFAULT_FN_ATTRS 92 _mm_div_sd(__m128d __a, __m128d __b) 93 { 94 __a[0] /= __b[0]; 95 return __a; 96 } 97 98 static __inline__ __m128d __DEFAULT_FN_ATTRS 99 _mm_div_pd(__m128d __a, __m128d __b) 100 { 101 
return (__m128d)((__v2df)__a / (__v2df)__b); 102 } 103 104 static __inline__ __m128d __DEFAULT_FN_ATTRS 105 _mm_sqrt_sd(__m128d __a, __m128d __b) 106 { 107 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b); 108 return (__m128d) { __c[0], __a[1] }; 109 } 110 111 static __inline__ __m128d __DEFAULT_FN_ATTRS 112 _mm_sqrt_pd(__m128d __a) 113 { 114 return __builtin_ia32_sqrtpd((__v2df)__a); 115 } 116 117 static __inline__ __m128d __DEFAULT_FN_ATTRS 118 _mm_min_sd(__m128d __a, __m128d __b) 119 { 120 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b); 121 } 122 123 static __inline__ __m128d __DEFAULT_FN_ATTRS 124 _mm_min_pd(__m128d __a, __m128d __b) 125 { 126 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b); 127 } 128 129 static __inline__ __m128d __DEFAULT_FN_ATTRS 130 _mm_max_sd(__m128d __a, __m128d __b) 131 { 132 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b); 133 } 134 135 static __inline__ __m128d __DEFAULT_FN_ATTRS 136 _mm_max_pd(__m128d __a, __m128d __b) 137 { 138 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b); 139 } 140 141 static __inline__ __m128d __DEFAULT_FN_ATTRS 142 _mm_and_pd(__m128d __a, __m128d __b) 143 { 144 return (__m128d)((__v4su)__a & (__v4su)__b); 145 } 146 147 static __inline__ __m128d __DEFAULT_FN_ATTRS 148 _mm_andnot_pd(__m128d __a, __m128d __b) 149 { 150 return (__m128d)(~(__v4su)__a & (__v4su)__b); 151 } 152 153 static __inline__ __m128d __DEFAULT_FN_ATTRS 154 _mm_or_pd(__m128d __a, __m128d __b) 155 { 156 return (__m128d)((__v4su)__a | (__v4su)__b); 157 } 158 159 static __inline__ __m128d __DEFAULT_FN_ATTRS 160 _mm_xor_pd(__m128d __a, __m128d __b) 161 { 162 return (__m128d)((__v4su)__a ^ (__v4su)__b); 163 } 164 165 static __inline__ __m128d __DEFAULT_FN_ATTRS 166 _mm_cmpeq_pd(__m128d __a, __m128d __b) 167 { 168 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b); 169 } 170 171 static __inline__ __m128d __DEFAULT_FN_ATTRS 172 _mm_cmplt_pd(__m128d __a, __m128d __b) 173 { 174 return 
(__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b); 175 } 176 177 static __inline__ __m128d __DEFAULT_FN_ATTRS 178 _mm_cmple_pd(__m128d __a, __m128d __b) 179 { 180 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b); 181 } 182 183 static __inline__ __m128d __DEFAULT_FN_ATTRS 184 _mm_cmpgt_pd(__m128d __a, __m128d __b) 185 { 186 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a); 187 } 188 189 static __inline__ __m128d __DEFAULT_FN_ATTRS 190 _mm_cmpge_pd(__m128d __a, __m128d __b) 191 { 192 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a); 193 } 194 195 static __inline__ __m128d __DEFAULT_FN_ATTRS 196 _mm_cmpord_pd(__m128d __a, __m128d __b) 197 { 198 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b); 199 } 200 201 static __inline__ __m128d __DEFAULT_FN_ATTRS 202 _mm_cmpunord_pd(__m128d __a, __m128d __b) 203 { 204 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b); 205 } 206 207 static __inline__ __m128d __DEFAULT_FN_ATTRS 208 _mm_cmpneq_pd(__m128d __a, __m128d __b) 209 { 210 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b); 211 } 212 213 static __inline__ __m128d __DEFAULT_FN_ATTRS 214 _mm_cmpnlt_pd(__m128d __a, __m128d __b) 215 { 216 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b); 217 } 218 219 static __inline__ __m128d __DEFAULT_FN_ATTRS 220 _mm_cmpnle_pd(__m128d __a, __m128d __b) 221 { 222 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b); 223 } 224 225 static __inline__ __m128d __DEFAULT_FN_ATTRS 226 _mm_cmpngt_pd(__m128d __a, __m128d __b) 227 { 228 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a); 229 } 230 231 static __inline__ __m128d __DEFAULT_FN_ATTRS 232 _mm_cmpnge_pd(__m128d __a, __m128d __b) 233 { 234 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a); 235 } 236 237 static __inline__ __m128d __DEFAULT_FN_ATTRS 238 _mm_cmpeq_sd(__m128d __a, __m128d __b) 239 { 240 return 
(__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b); 241 } 242 243 static __inline__ __m128d __DEFAULT_FN_ATTRS 244 _mm_cmplt_sd(__m128d __a, __m128d __b) 245 { 246 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b); 247 } 248 249 static __inline__ __m128d __DEFAULT_FN_ATTRS 250 _mm_cmple_sd(__m128d __a, __m128d __b) 251 { 252 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b); 253 } 254 255 static __inline__ __m128d __DEFAULT_FN_ATTRS 256 _mm_cmpgt_sd(__m128d __a, __m128d __b) 257 { 258 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a); 259 return (__m128d) { __c[0], __a[1] }; 260 } 261 262 static __inline__ __m128d __DEFAULT_FN_ATTRS 263 _mm_cmpge_sd(__m128d __a, __m128d __b) 264 { 265 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a); 266 return (__m128d) { __c[0], __a[1] }; 267 } 268 269 static __inline__ __m128d __DEFAULT_FN_ATTRS 270 _mm_cmpord_sd(__m128d __a, __m128d __b) 271 { 272 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b); 273 } 274 275 static __inline__ __m128d __DEFAULT_FN_ATTRS 276 _mm_cmpunord_sd(__m128d __a, __m128d __b) 277 { 278 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b); 279 } 280 281 static __inline__ __m128d __DEFAULT_FN_ATTRS 282 _mm_cmpneq_sd(__m128d __a, __m128d __b) 283 { 284 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b); 285 } 286 287 static __inline__ __m128d __DEFAULT_FN_ATTRS 288 _mm_cmpnlt_sd(__m128d __a, __m128d __b) 289 { 290 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b); 291 } 292 293 static __inline__ __m128d __DEFAULT_FN_ATTRS 294 _mm_cmpnle_sd(__m128d __a, __m128d __b) 295 { 296 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b); 297 } 298 299 static __inline__ __m128d __DEFAULT_FN_ATTRS 300 _mm_cmpngt_sd(__m128d __a, __m128d __b) 301 { 302 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a); 303 return (__m128d) { __c[0], __a[1] }; 304 } 305 306 static 
__inline__ __m128d __DEFAULT_FN_ATTRS 307 _mm_cmpnge_sd(__m128d __a, __m128d __b) 308 { 309 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a); 310 return (__m128d) { __c[0], __a[1] }; 311 } 312 313 static __inline__ int __DEFAULT_FN_ATTRS 314 _mm_comieq_sd(__m128d __a, __m128d __b) 315 { 316 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b); 317 } 318 319 static __inline__ int __DEFAULT_FN_ATTRS 320 _mm_comilt_sd(__m128d __a, __m128d __b) 321 { 322 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b); 323 } 324 325 static __inline__ int __DEFAULT_FN_ATTRS 326 _mm_comile_sd(__m128d __a, __m128d __b) 327 { 328 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b); 329 } 330 331 static __inline__ int __DEFAULT_FN_ATTRS 332 _mm_comigt_sd(__m128d __a, __m128d __b) 333 { 334 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b); 335 } 336 337 static __inline__ int __DEFAULT_FN_ATTRS 338 _mm_comige_sd(__m128d __a, __m128d __b) 339 { 340 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b); 341 } 342 343 static __inline__ int __DEFAULT_FN_ATTRS 344 _mm_comineq_sd(__m128d __a, __m128d __b) 345 { 346 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b); 347 } 348 349 static __inline__ int __DEFAULT_FN_ATTRS 350 _mm_ucomieq_sd(__m128d __a, __m128d __b) 351 { 352 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b); 353 } 354 355 static __inline__ int __DEFAULT_FN_ATTRS 356 _mm_ucomilt_sd(__m128d __a, __m128d __b) 357 { 358 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b); 359 } 360 361 static __inline__ int __DEFAULT_FN_ATTRS 362 _mm_ucomile_sd(__m128d __a, __m128d __b) 363 { 364 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b); 365 } 366 367 static __inline__ int __DEFAULT_FN_ATTRS 368 _mm_ucomigt_sd(__m128d __a, __m128d __b) 369 { 370 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b); 371 } 372 373 static __inline__ int __DEFAULT_FN_ATTRS 374 _mm_ucomige_sd(__m128d __a, __m128d __b) 375 { 376 return 
__builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b); 377 } 378 379 static __inline__ int __DEFAULT_FN_ATTRS 380 _mm_ucomineq_sd(__m128d __a, __m128d __b) 381 { 382 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b); 383 } 384 385 static __inline__ __m128 __DEFAULT_FN_ATTRS 386 _mm_cvtpd_ps(__m128d __a) 387 { 388 return __builtin_ia32_cvtpd2ps((__v2df)__a); 389 } 390 391 static __inline__ __m128d __DEFAULT_FN_ATTRS 392 _mm_cvtps_pd(__m128 __a) 393 { 394 return (__m128d) __builtin_convertvector( 395 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); 396 } 397 398 static __inline__ __m128d __DEFAULT_FN_ATTRS 399 _mm_cvtepi32_pd(__m128i __a) 400 { 401 return (__m128d) __builtin_convertvector( 402 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); 403 } 404 405 static __inline__ __m128i __DEFAULT_FN_ATTRS 406 _mm_cvtpd_epi32(__m128d __a) 407 { 408 return __builtin_ia32_cvtpd2dq((__v2df)__a); 409 } 410 411 static __inline__ int __DEFAULT_FN_ATTRS 412 _mm_cvtsd_si32(__m128d __a) 413 { 414 return __builtin_ia32_cvtsd2si((__v2df)__a); 415 } 416 417 static __inline__ __m128 __DEFAULT_FN_ATTRS 418 _mm_cvtsd_ss(__m128 __a, __m128d __b) 419 { 420 __a[0] = __b[0]; 421 return __a; 422 } 423 424 static __inline__ __m128d __DEFAULT_FN_ATTRS 425 _mm_cvtsi32_sd(__m128d __a, int __b) 426 { 427 __a[0] = __b; 428 return __a; 429 } 430 431 static __inline__ __m128d __DEFAULT_FN_ATTRS 432 _mm_cvtss_sd(__m128d __a, __m128 __b) 433 { 434 __a[0] = __b[0]; 435 return __a; 436 } 437 438 static __inline__ __m128i __DEFAULT_FN_ATTRS 439 _mm_cvttpd_epi32(__m128d __a) 440 { 441 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a); 442 } 443 444 static __inline__ int __DEFAULT_FN_ATTRS 445 _mm_cvttsd_si32(__m128d __a) 446 { 447 return __a[0]; 448 } 449 450 static __inline__ __m64 __DEFAULT_FN_ATTRS 451 _mm_cvtpd_pi32(__m128d __a) 452 { 453 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); 454 } 455 456 static __inline__ __m64 __DEFAULT_FN_ATTRS 457 
_mm_cvttpd_pi32(__m128d __a) 458 { 459 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); 460 } 461 462 static __inline__ __m128d __DEFAULT_FN_ATTRS 463 _mm_cvtpi32_pd(__m64 __a) 464 { 465 return __builtin_ia32_cvtpi2pd((__v2si)__a); 466 } 467 468 static __inline__ double __DEFAULT_FN_ATTRS 469 _mm_cvtsd_f64(__m128d __a) 470 { 471 return __a[0]; 472 } 473 474 static __inline__ __m128d __DEFAULT_FN_ATTRS 475 _mm_load_pd(double const *__dp) 476 { 477 return *(__m128d*)__dp; 478 } 479 480 static __inline__ __m128d __DEFAULT_FN_ATTRS 481 _mm_load1_pd(double const *__dp) 482 { 483 struct __mm_load1_pd_struct { 484 double __u; 485 } __attribute__((__packed__, __may_alias__)); 486 double __u = ((struct __mm_load1_pd_struct*)__dp)->__u; 487 return (__m128d){ __u, __u }; 488 } 489 490 #define _mm_load_pd1(dp) _mm_load1_pd(dp) 491 492 static __inline__ __m128d __DEFAULT_FN_ATTRS 493 _mm_loadr_pd(double const *__dp) 494 { 495 __m128d __u = *(__m128d*)__dp; 496 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0); 497 } 498 499 static __inline__ __m128d __DEFAULT_FN_ATTRS 500 _mm_loadu_pd(double const *__dp) 501 { 502 struct __loadu_pd { 503 __m128d __v; 504 } __attribute__((__packed__, __may_alias__)); 505 return ((struct __loadu_pd*)__dp)->__v; 506 } 507 508 static __inline__ __m128i __DEFAULT_FN_ATTRS 509 _mm_loadu_si64(void const *__a) 510 { 511 struct __loadu_si64 { 512 long long __v; 513 } __attribute__((__packed__, __may_alias__)); 514 long long __u = ((struct __loadu_si64*)__a)->__v; 515 return (__m128i){__u, 0L}; 516 } 517 518 static __inline__ __m128d __DEFAULT_FN_ATTRS 519 _mm_load_sd(double const *__dp) 520 { 521 struct __mm_load_sd_struct { 522 double __u; 523 } __attribute__((__packed__, __may_alias__)); 524 double __u = ((struct __mm_load_sd_struct*)__dp)->__u; 525 return (__m128d){ __u, 0 }; 526 } 527 528 static __inline__ __m128d __DEFAULT_FN_ATTRS 529 _mm_loadh_pd(__m128d __a, double const *__dp) 530 { 531 struct __mm_loadh_pd_struct { 532 double 
__u; 533 } __attribute__((__packed__, __may_alias__)); 534 double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u; 535 return (__m128d){ __a[0], __u }; 536 } 537 538 static __inline__ __m128d __DEFAULT_FN_ATTRS 539 _mm_loadl_pd(__m128d __a, double const *__dp) 540 { 541 struct __mm_loadl_pd_struct { 542 double __u; 543 } __attribute__((__packed__, __may_alias__)); 544 double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u; 545 return (__m128d){ __u, __a[1] }; 546 } 547 548 static __inline__ __m128d __DEFAULT_FN_ATTRS 549 _mm_undefined_pd(void) 550 { 551 return (__m128d)__builtin_ia32_undef128(); 552 } 553 554 static __inline__ __m128d __DEFAULT_FN_ATTRS 555 _mm_set_sd(double __w) 556 { 557 return (__m128d){ __w, 0 }; 558 } 559 560 static __inline__ __m128d __DEFAULT_FN_ATTRS 561 _mm_set1_pd(double __w) 562 { 563 return (__m128d){ __w, __w }; 564 } 565 566 static __inline__ __m128d __DEFAULT_FN_ATTRS 567 _mm_set_pd(double __w, double __x) 568 { 569 return (__m128d){ __x, __w }; 570 } 571 572 static __inline__ __m128d __DEFAULT_FN_ATTRS 573 _mm_setr_pd(double __w, double __x) 574 { 575 return (__m128d){ __w, __x }; 576 } 577 578 static __inline__ __m128d __DEFAULT_FN_ATTRS 579 _mm_setzero_pd(void) 580 { 581 return (__m128d){ 0, 0 }; 582 } 583 584 static __inline__ __m128d __DEFAULT_FN_ATTRS 585 _mm_move_sd(__m128d __a, __m128d __b) 586 { 587 return (__m128d){ __b[0], __a[1] }; 588 } 589 590 static __inline__ void __DEFAULT_FN_ATTRS 591 _mm_store_sd(double *__dp, __m128d __a) 592 { 593 struct __mm_store_sd_struct { 594 double __u; 595 } __attribute__((__packed__, __may_alias__)); 596 ((struct __mm_store_sd_struct*)__dp)->__u = __a[0]; 597 } 598 599 static __inline__ void __DEFAULT_FN_ATTRS 600 _mm_store_pd(double *__dp, __m128d __a) 601 { 602 *(__m128d*)__dp = __a; 603 } 604 605 static __inline__ void __DEFAULT_FN_ATTRS 606 _mm_store1_pd(double *__dp, __m128d __a) 607 { 608 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 609 _mm_store_pd(__dp, __a); 610 
} 611 612 static __inline__ void __DEFAULT_FN_ATTRS 613 _mm_store_pd1(double *__dp, __m128d __a) 614 { 615 return _mm_store1_pd(__dp, __a); 616 } 617 618 static __inline__ void __DEFAULT_FN_ATTRS 619 _mm_storeu_pd(double *__dp, __m128d __a) 620 { 621 struct __storeu_pd { 622 __m128d __v; 623 } __attribute__((__packed__, __may_alias__)); 624 ((struct __storeu_pd*)__dp)->__v = __a; 625 } 626 627 static __inline__ void __DEFAULT_FN_ATTRS 628 _mm_storer_pd(double *__dp, __m128d __a) 629 { 630 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0); 631 *(__m128d *)__dp = __a; 632 } 633 634 static __inline__ void __DEFAULT_FN_ATTRS 635 _mm_storeh_pd(double *__dp, __m128d __a) 636 { 637 struct __mm_storeh_pd_struct { 638 double __u; 639 } __attribute__((__packed__, __may_alias__)); 640 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1]; 641 } 642 643 static __inline__ void __DEFAULT_FN_ATTRS 644 _mm_storel_pd(double *__dp, __m128d __a) 645 { 646 struct __mm_storeh_pd_struct { 647 double __u; 648 } __attribute__((__packed__, __may_alias__)); 649 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0]; 650 } 651 652 static __inline__ __m128i __DEFAULT_FN_ATTRS 653 _mm_add_epi8(__m128i __a, __m128i __b) 654 { 655 return (__m128i)((__v16qu)__a + (__v16qu)__b); 656 } 657 658 static __inline__ __m128i __DEFAULT_FN_ATTRS 659 _mm_add_epi16(__m128i __a, __m128i __b) 660 { 661 return (__m128i)((__v8hu)__a + (__v8hu)__b); 662 } 663 664 static __inline__ __m128i __DEFAULT_FN_ATTRS 665 _mm_add_epi32(__m128i __a, __m128i __b) 666 { 667 return (__m128i)((__v4su)__a + (__v4su)__b); 668 } 669 670 static __inline__ __m64 __DEFAULT_FN_ATTRS 671 _mm_add_si64(__m64 __a, __m64 __b) 672 { 673 return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); 674 } 675 676 static __inline__ __m128i __DEFAULT_FN_ATTRS 677 _mm_add_epi64(__m128i __a, __m128i __b) 678 { 679 return (__m128i)((__v2du)__a + (__v2du)__b); 680 } 681 682 static __inline__ __m128i __DEFAULT_FN_ATTRS 683 
_mm_adds_epi8(__m128i __a, __m128i __b) 684 { 685 return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b); 686 } 687 688 static __inline__ __m128i __DEFAULT_FN_ATTRS 689 _mm_adds_epi16(__m128i __a, __m128i __b) 690 { 691 return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b); 692 } 693 694 static __inline__ __m128i __DEFAULT_FN_ATTRS 695 _mm_adds_epu8(__m128i __a, __m128i __b) 696 { 697 return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b); 698 } 699 700 static __inline__ __m128i __DEFAULT_FN_ATTRS 701 _mm_adds_epu16(__m128i __a, __m128i __b) 702 { 703 return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b); 704 } 705 706 static __inline__ __m128i __DEFAULT_FN_ATTRS 707 _mm_avg_epu8(__m128i __a, __m128i __b) 708 { 709 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); 710 } 711 712 static __inline__ __m128i __DEFAULT_FN_ATTRS 713 _mm_avg_epu16(__m128i __a, __m128i __b) 714 { 715 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); 716 } 717 718 static __inline__ __m128i __DEFAULT_FN_ATTRS 719 _mm_madd_epi16(__m128i __a, __m128i __b) 720 { 721 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); 722 } 723 724 static __inline__ __m128i __DEFAULT_FN_ATTRS 725 _mm_max_epi16(__m128i __a, __m128i __b) 726 { 727 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b); 728 } 729 730 static __inline__ __m128i __DEFAULT_FN_ATTRS 731 _mm_max_epu8(__m128i __a, __m128i __b) 732 { 733 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b); 734 } 735 736 static __inline__ __m128i __DEFAULT_FN_ATTRS 737 _mm_min_epi16(__m128i __a, __m128i __b) 738 { 739 return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b); 740 } 741 742 static __inline__ __m128i __DEFAULT_FN_ATTRS 743 _mm_min_epu8(__m128i __a, __m128i __b) 744 { 745 return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b); 746 } 747 748 static __inline__ __m128i __DEFAULT_FN_ATTRS 
749 _mm_mulhi_epi16(__m128i __a, __m128i __b) 750 { 751 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); 752 } 753 754 static __inline__ __m128i __DEFAULT_FN_ATTRS 755 _mm_mulhi_epu16(__m128i __a, __m128i __b) 756 { 757 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); 758 } 759 760 /// \brief Multiplies the corresponding elements of two [8 x short] vectors and 761 /// returns a vector containing the low-order 16 bits of each 32-bit product 762 /// in the corresponding element. 763 /// 764 /// \headerfile <x86intrin.h> 765 /// 766 /// This intrinsic corresponds to the \c VPMULLW / PMULLW instruction. 767 /// 768 /// \param __a 769 /// A 128-bit integer vector containing one of the source operands. 770 /// \param __b 771 /// A 128-bit integer vector containing one of the source operands. 772 /// \returns A 128-bit integer vector containing the products of both operands. 773 static __inline__ __m128i __DEFAULT_FN_ATTRS 774 _mm_mullo_epi16(__m128i __a, __m128i __b) 775 { 776 return (__m128i)((__v8hu)__a * (__v8hu)__b); 777 } 778 779 /// \brief Multiplies 32-bit unsigned integer values contained in the lower bits 780 /// of the two 64-bit integer vectors and returns the 64-bit unsigned 781 /// product. 782 /// 783 /// \headerfile <x86intrin.h> 784 /// 785 /// This intrinsic corresponds to the \c PMULUDQ instruction. 786 /// 787 /// \param __a 788 /// A 64-bit integer containing one of the source operands. 789 /// \param __b 790 /// A 64-bit integer containing one of the source operands. 791 /// \returns A 64-bit integer vector containing the product of both operands. 
792 static __inline__ __m64 __DEFAULT_FN_ATTRS 793 _mm_mul_su32(__m64 __a, __m64 __b) 794 { 795 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); 796 } 797 798 /// \brief Multiplies 32-bit unsigned integer values contained in the lower 799 /// bits of the corresponding elements of two [2 x i64] vectors, and returns 800 /// the 64-bit products in the corresponding elements of a [2 x i64] vector. 801 /// 802 /// \headerfile <x86intrin.h> 803 /// 804 /// This intrinsic corresponds to the \c VPMULUDQ / PMULUDQ instruction. 805 /// 806 /// \param __a 807 /// A [2 x i64] vector containing one of the source operands. 808 /// \param __b 809 /// A [2 x i64] vector containing one of the source operands. 810 /// \returns A [2 x i64] vector containing the product of both operands. 811 static __inline__ __m128i __DEFAULT_FN_ATTRS 812 _mm_mul_epu32(__m128i __a, __m128i __b) 813 { 814 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); 815 } 816 817 /// \brief Computes the absolute differences of corresponding 8-bit integer 818 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and 819 /// separately sums the second 8 absolute differences. Packss these two 820 /// unsigned 16-bit integer sums into the upper and lower elements of a 821 /// [2 x i64] vector. 822 /// 823 /// \headerfile <x86intrin.h> 824 /// 825 /// This intrinsic corresponds to the \c VPSADBW / PSADBW instruction. 826 /// 827 /// \param __a 828 /// A 128-bit integer vector containing one of the source operands. 829 /// \param __b 830 /// A 128-bit integer vector containing one of the source operands. 831 /// \returns A [2 x i64] vector containing the sums of the sets of absolute 832 /// differences between both operands. 833 static __inline__ __m128i __DEFAULT_FN_ATTRS 834 _mm_sad_epu8(__m128i __a, __m128i __b) 835 { 836 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); 837 } 838 839 /// \brief Subtracts the corresponding 8-bit integer values in the operands. 
840 /// 841 /// \headerfile <x86intrin.h> 842 /// 843 /// This intrinsic corresponds to the \c VPSUBB / PSUBB instruction. 844 /// 845 /// \param __a 846 /// A 128-bit integer vector containing the minuends. 847 /// \param __b 848 /// A 128-bit integer vector containing the subtrahends. 849 /// \returns A 128-bit integer vector containing the differences of the values 850 /// in the operands. 851 static __inline__ __m128i __DEFAULT_FN_ATTRS 852 _mm_sub_epi8(__m128i __a, __m128i __b) 853 { 854 return (__m128i)((__v16qu)__a - (__v16qu)__b); 855 } 856 857 /// \brief Subtracts the corresponding 16-bit integer values in the operands. 858 /// 859 /// \headerfile <x86intrin.h> 860 /// 861 /// This intrinsic corresponds to the \c VPSUBW / PSUBW instruction. 862 /// 863 /// \param __a 864 /// A 128-bit integer vector containing the minuends. 865 /// \param __b 866 /// A 128-bit integer vector containing the subtrahends. 867 /// \returns A 128-bit integer vector containing the differences of the values 868 /// in the operands. 869 static __inline__ __m128i __DEFAULT_FN_ATTRS 870 _mm_sub_epi16(__m128i __a, __m128i __b) 871 { 872 return (__m128i)((__v8hu)__a - (__v8hu)__b); 873 } 874 875 /// \brief Subtracts the corresponding 32-bit integer values in the operands. 876 /// 877 /// \headerfile <x86intrin.h> 878 /// 879 /// This intrinsic corresponds to the \c VPSUBD / PSUBD instruction. 880 /// 881 /// \param __a 882 /// A 128-bit integer vector containing the minuends. 883 /// \param __b 884 /// A 128-bit integer vector containing the subtrahends. 885 /// \returns A 128-bit integer vector containing the differences of the values 886 /// in the operands. 887 static __inline__ __m128i __DEFAULT_FN_ATTRS 888 _mm_sub_epi32(__m128i __a, __m128i __b) 889 { 890 return (__m128i)((__v4su)__a - (__v4su)__b); 891 } 892 893 /// \brief Subtracts signed or unsigned 64-bit integer values and writes the 894 /// difference to the corresponding bits in the destination. 
895 /// 896 /// \headerfile <x86intrin.h> 897 /// 898 /// This intrinsic corresponds to the \c PSUBQ instruction. 899 /// 900 /// \param __a 901 /// A 64-bit integer vector containing the minuend. 902 /// \param __b 903 /// A 64-bit integer vector containing the subtrahend. 904 /// \returns A 64-bit integer vector containing the difference of the values in 905 /// the operands. 906 static __inline__ __m64 __DEFAULT_FN_ATTRS 907 _mm_sub_si64(__m64 __a, __m64 __b) 908 { 909 return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); 910 } 911 912 /// \brief Subtracts the corresponding elements of two [2 x i64] vectors. 913 /// 914 /// \headerfile <x86intrin.h> 915 /// 916 /// This intrinsic corresponds to the \c VPSUBQ / PSUBQ instruction. 917 /// 918 /// \param __a 919 /// A 128-bit integer vector containing the minuends. 920 /// \param __b 921 /// A 128-bit integer vector containing the subtrahends. 922 /// \returns A 128-bit integer vector containing the differences of the values 923 /// in the operands. 924 static __inline__ __m128i __DEFAULT_FN_ATTRS 925 _mm_sub_epi64(__m128i __a, __m128i __b) 926 { 927 return (__m128i)((__v2du)__a - (__v2du)__b); 928 } 929 930 /// \brief Subtracts corresponding 8-bit signed integer values in the input and 931 /// returns the differences in the corresponding bytes in the destination. 932 /// Differences greater than 7Fh are saturated to 7Fh, and differences less 933 /// than 80h are saturated to 80h. 934 /// 935 /// \headerfile <x86intrin.h> 936 /// 937 /// This intrinsic corresponds to the \c VPSUBSB / PSUBSB instruction. 938 /// 939 /// \param __a 940 /// A 128-bit integer vector containing the minuends. 941 /// \param __b 942 /// A 128-bit integer vector containing the subtrahends. 943 /// \returns A 128-bit integer vector containing the differences of the values 944 /// in the operands. 
945 static __inline__ __m128i __DEFAULT_FN_ATTRS 946 _mm_subs_epi8(__m128i __a, __m128i __b) 947 { 948 return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b); 949 } 950 951 /// \brief Subtracts corresponding 16-bit signed integer values in the input and 952 /// returns the differences in the corresponding bytes in the destination. 953 /// Differences greater than 7FFFh are saturated to 7FFFh, and values less 954 /// than 8000h are saturated to 8000h. 955 /// 956 /// \headerfile <x86intrin.h> 957 /// 958 /// This intrinsic corresponds to the \c VPSUBSW / PSUBSW instruction. 959 /// 960 /// \param __a 961 /// A 128-bit integer vector containing the minuends. 962 /// \param __b 963 /// A 128-bit integer vector containing the subtrahends. 964 /// \returns A 128-bit integer vector containing the differences of the values 965 /// in the operands. 966 static __inline__ __m128i __DEFAULT_FN_ATTRS 967 _mm_subs_epi16(__m128i __a, __m128i __b) 968 { 969 return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b); 970 } 971 972 /// \brief Subtracts corresponding 8-bit unsigned integer values in the input 973 /// and returns the differences in the corresponding bytes in the 974 /// destination. Differences less than 00h are saturated to 00h. 975 /// 976 /// \headerfile <x86intrin.h> 977 /// 978 /// This intrinsic corresponds to the \c VPSUBUSB / PSUBUSB instruction. 979 /// 980 /// \param __a 981 /// A 128-bit integer vector containing the minuends. 982 /// \param __b 983 /// A 128-bit integer vector containing the subtrahends. 984 /// \returns A 128-bit integer vector containing the unsigned integer 985 /// differences of the values in the operands. 
986 static __inline__ __m128i __DEFAULT_FN_ATTRS 987 _mm_subs_epu8(__m128i __a, __m128i __b) 988 { 989 return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b); 990 } 991 992 /// \brief Subtracts corresponding 16-bit unsigned integer values in the input 993 /// and returns the differences in the corresponding bytes in the 994 /// destination. Differences less than 0000h are saturated to 0000h. 995 /// 996 /// \headerfile <x86intrin.h> 997 /// 998 /// This intrinsic corresponds to the \c VPSUBUSW / PSUBUSW instruction. 999 /// 1000 /// \param __a 1001 /// A 128-bit integer vector containing the minuends. 1002 /// \param __b 1003 /// A 128-bit integer vector containing the subtrahends. 1004 /// \returns A 128-bit integer vector containing the unsigned integer 1005 /// differences of the values in the operands. 1006 static __inline__ __m128i __DEFAULT_FN_ATTRS 1007 _mm_subs_epu16(__m128i __a, __m128i __b) 1008 { 1009 return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b); 1010 } 1011 1012 /// \brief Performs a bitwise AND of two 128-bit integer vectors. 1013 /// 1014 /// \headerfile <x86intrin.h> 1015 /// 1016 /// This intrinsic corresponds to the \c VPAND / PAND instruction. 1017 /// 1018 /// \param __a 1019 /// A 128-bit integer vector containing one of the source operands. 1020 /// \param __b 1021 /// A 128-bit integer vector containing one of the source operands. 1022 /// \returns A 128-bit integer vector containing the bitwise AND of the values 1023 /// in both operands. 1024 static __inline__ __m128i __DEFAULT_FN_ATTRS 1025 _mm_and_si128(__m128i __a, __m128i __b) 1026 { 1027 return (__m128i)((__v2du)__a & (__v2du)__b); 1028 } 1029 1030 /// \brief Performs a bitwise AND of two 128-bit integer vectors, using the 1031 /// one's complement of the values contained in the first source operand. 1032 /// 1033 /// \headerfile <x86intrin.h> 1034 /// 1035 /// This intrinsic corresponds to the \c VPANDN / PANDN instruction. 
1036 /// 1037 /// \param __a 1038 /// A 128-bit vector containing the left source operand. The one's complement 1039 /// of this value is used in the bitwise AND. 1040 /// \param __b 1041 /// A 128-bit vector containing the right source operand. 1042 /// \returns A 128-bit integer vector containing the bitwise AND of the one's 1043 /// complement of the first operand and the values in the second operand. 1044 static __inline__ __m128i __DEFAULT_FN_ATTRS 1045 _mm_andnot_si128(__m128i __a, __m128i __b) 1046 { 1047 return (__m128i)(~(__v2du)__a & (__v2du)__b); 1048 } 1049 /// \brief Performs a bitwise OR of two 128-bit integer vectors. 1050 /// 1051 /// \headerfile <x86intrin.h> 1052 /// 1053 /// This intrinsic corresponds to the \c VPOR / POR instruction. 1054 /// 1055 /// \param __a 1056 /// A 128-bit integer vector containing one of the source operands. 1057 /// \param __b 1058 /// A 128-bit integer vector containing one of the source operands. 1059 /// \returns A 128-bit integer vector containing the bitwise OR of the values 1060 /// in both operands. 1061 static __inline__ __m128i __DEFAULT_FN_ATTRS 1062 _mm_or_si128(__m128i __a, __m128i __b) 1063 { 1064 return (__m128i)((__v2du)__a | (__v2du)__b); 1065 } 1066 1067 /// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors. 1068 /// 1069 /// \headerfile <x86intrin.h> 1070 /// 1071 /// This intrinsic corresponds to the \c VPXOR / PXOR instruction. 1072 /// 1073 /// \param __a 1074 /// A 128-bit integer vector containing one of the source operands. 1075 /// \param __b 1076 /// A 128-bit integer vector containing one of the source operands. 1077 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the 1078 /// values in both operands. 
1079 static __inline__ __m128i __DEFAULT_FN_ATTRS 1080 _mm_xor_si128(__m128i __a, __m128i __b) 1081 { 1082 return (__m128i)((__v2du)__a ^ (__v2du)__b); 1083 } 1084 1085 /// \brief Left-shifts the 128-bit integer vector operand by the specified 1086 /// number of bytes. Low-order bits are cleared. 1087 /// 1088 /// \headerfile <x86intrin.h> 1089 /// 1090 /// \code 1091 /// __m128i _mm_slli_si128(__m128i a, const int imm); 1092 /// \endcode 1093 /// 1094 /// This intrinsic corresponds to the \c VPSLLDQ / PSLLDQ instruction. 1095 /// 1096 /// \param a 1097 /// A 128-bit integer vector containing the source operand. 1098 /// \param imm 1099 /// An immediate value specifying the number of bytes to left-shift 1100 /// operand a. 1101 /// \returns A 128-bit integer vector containing the left-shifted value. 1102 #define _mm_slli_si128(a, imm) __extension__ ({ \ 1103 (__m128i)__builtin_shufflevector( \ 1104 (__v16qi)_mm_setzero_si128(), \ 1105 (__v16qi)(__m128i)(a), \ 1106 ((char)(imm)&0xF0) ? 0 : 16 - (char)(imm), \ 1107 ((char)(imm)&0xF0) ? 1 : 17 - (char)(imm), \ 1108 ((char)(imm)&0xF0) ? 2 : 18 - (char)(imm), \ 1109 ((char)(imm)&0xF0) ? 3 : 19 - (char)(imm), \ 1110 ((char)(imm)&0xF0) ? 4 : 20 - (char)(imm), \ 1111 ((char)(imm)&0xF0) ? 5 : 21 - (char)(imm), \ 1112 ((char)(imm)&0xF0) ? 6 : 22 - (char)(imm), \ 1113 ((char)(imm)&0xF0) ? 7 : 23 - (char)(imm), \ 1114 ((char)(imm)&0xF0) ? 8 : 24 - (char)(imm), \ 1115 ((char)(imm)&0xF0) ? 9 : 25 - (char)(imm), \ 1116 ((char)(imm)&0xF0) ? 10 : 26 - (char)(imm), \ 1117 ((char)(imm)&0xF0) ? 11 : 27 - (char)(imm), \ 1118 ((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \ 1119 ((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \ 1120 ((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \ 1121 ((char)(imm)&0xF0) ? 15 : 31 - (char)(imm)); }) 1122 1123 #define _mm_bslli_si128(a, imm) \ 1124 _mm_slli_si128((a), (imm)) 1125 1126 /// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand 1127 /// by the specified number of bits. 
Low-order bits are cleared. 1128 /// 1129 /// \headerfile <x86intrin.h> 1130 /// 1131 /// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction. 1132 /// 1133 /// \param __a 1134 /// A 128-bit integer vector containing the source operand. 1135 /// \param __count 1136 /// An integer value specifying the number of bits to left-shift each value 1137 /// in operand __a. 1138 /// \returns A 128-bit integer vector containing the left-shifted values. 1139 static __inline__ __m128i __DEFAULT_FN_ATTRS 1140 _mm_slli_epi16(__m128i __a, int __count) 1141 { 1142 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); 1143 } 1144 1145 /// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand 1146 /// by the specified number of bits. Low-order bits are cleared. 1147 /// 1148 /// \headerfile <x86intrin.h> 1149 /// 1150 /// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction. 1151 /// 1152 /// \param __a 1153 /// A 128-bit integer vector containing the source operand. 1154 /// \param __count 1155 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 1156 /// to left-shift each value in operand __a. 1157 /// \returns A 128-bit integer vector containing the left-shifted values. 1158 static __inline__ __m128i __DEFAULT_FN_ATTRS 1159 _mm_sll_epi16(__m128i __a, __m128i __count) 1160 { 1161 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); 1162 } 1163 1164 /// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand 1165 /// by the specified number of bits. Low-order bits are cleared. 1166 /// 1167 /// \headerfile <x86intrin.h> 1168 /// 1169 /// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction. 1170 /// 1171 /// \param __a 1172 /// A 128-bit integer vector containing the source operand. 1173 /// \param __count 1174 /// An integer value specifying the number of bits to left-shift each value 1175 /// in operand __a. 
1176 /// \returns A 128-bit integer vector containing the left-shifted values. 1177 static __inline__ __m128i __DEFAULT_FN_ATTRS 1178 _mm_slli_epi32(__m128i __a, int __count) 1179 { 1180 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); 1181 } 1182 1183 /// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand 1184 /// by the specified number of bits. Low-order bits are cleared. 1185 /// 1186 /// \headerfile <x86intrin.h> 1187 /// 1188 /// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction. 1189 /// 1190 /// \param __a 1191 /// A 128-bit integer vector containing the source operand. 1192 /// \param __count 1193 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 1194 /// to left-shift each value in operand __a. 1195 /// \returns A 128-bit integer vector containing the left-shifted values. 1196 static __inline__ __m128i __DEFAULT_FN_ATTRS 1197 _mm_sll_epi32(__m128i __a, __m128i __count) 1198 { 1199 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); 1200 } 1201 1202 /// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand 1203 /// by the specified number of bits. Low-order bits are cleared. 1204 /// 1205 /// \headerfile <x86intrin.h> 1206 /// 1207 /// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction. 1208 /// 1209 /// \param __a 1210 /// A 128-bit integer vector containing the source operand. 1211 /// \param __count 1212 /// An integer value specifying the number of bits to left-shift each value 1213 /// in operand __a. 1214 /// \returns A 128-bit integer vector containing the left-shifted values. 1215 static __inline__ __m128i __DEFAULT_FN_ATTRS 1216 _mm_slli_epi64(__m128i __a, int __count) 1217 { 1218 return __builtin_ia32_psllqi128((__v2di)__a, __count); 1219 } 1220 1221 /// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand 1222 /// by the specified number of bits. Low-order bits are cleared. 
1223 /// 1224 /// \headerfile <x86intrin.h> 1225 /// 1226 /// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction. 1227 /// 1228 /// \param __a 1229 /// A 128-bit integer vector containing the source operand. 1230 /// \param __count 1231 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 1232 /// to left-shift each value in operand __a. 1233 /// \returns A 128-bit integer vector containing the left-shifted values. 1234 static __inline__ __m128i __DEFAULT_FN_ATTRS 1235 _mm_sll_epi64(__m128i __a, __m128i __count) 1236 { 1237 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count); 1238 } 1239 1240 /// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand 1241 /// by the specified number of bits. High-order bits are filled with the sign 1242 /// bit of the initial value. 1243 /// 1244 /// \headerfile <x86intrin.h> 1245 /// 1246 /// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction. 1247 /// 1248 /// \param __a 1249 /// A 128-bit integer vector containing the source operand. 1250 /// \param __count 1251 /// An integer value specifying the number of bits to right-shift each value 1252 /// in operand __a. 1253 /// \returns A 128-bit integer vector containing the right-shifted values. 1254 static __inline__ __m128i __DEFAULT_FN_ATTRS 1255 _mm_srai_epi16(__m128i __a, int __count) 1256 { 1257 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); 1258 } 1259 1260 /// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand 1261 /// by the specified number of bits. High-order bits are filled with the sign 1262 /// bit of the initial value. 1263 /// 1264 /// \headerfile <x86intrin.h> 1265 /// 1266 /// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction. 1267 /// 1268 /// \param __a 1269 /// A 128-bit integer vector containing the source operand. 
1270 /// \param __count 1271 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 1272 /// to right-shift each value in operand __a. 1273 /// \returns A 128-bit integer vector containing the right-shifted values. 1274 static __inline__ __m128i __DEFAULT_FN_ATTRS 1275 _mm_sra_epi16(__m128i __a, __m128i __count) 1276 { 1277 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); 1278 } 1279 1280 /// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand 1281 /// by the specified number of bits. High-order bits are filled with the sign 1282 /// bit of the initial value. 1283 /// 1284 /// \headerfile <x86intrin.h> 1285 /// 1286 /// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction. 1287 /// 1288 /// \param __a 1289 /// A 128-bit integer vector containing the source operand. 1290 /// \param __count 1291 /// An integer value specifying the number of bits to right-shift each value 1292 /// in operand __a. 1293 /// \returns A 128-bit integer vector containing the right-shifted values. 1294 static __inline__ __m128i __DEFAULT_FN_ATTRS 1295 _mm_srai_epi32(__m128i __a, int __count) 1296 { 1297 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); 1298 } 1299 1300 /// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand 1301 /// by the specified number of bits. High-order bits are filled with the sign 1302 /// bit of the initial value. 1303 /// 1304 /// \headerfile <x86intrin.h> 1305 /// 1306 /// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction. 1307 /// 1308 /// \param __a 1309 /// A 128-bit integer vector containing the source operand. 1310 /// \param __count 1311 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 1312 /// to right-shift each value in operand __a. 1313 /// \returns A 128-bit integer vector containing the right-shifted values. 
1314 static __inline__ __m128i __DEFAULT_FN_ATTRS 1315 _mm_sra_epi32(__m128i __a, __m128i __count) 1316 { 1317 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); 1318 } 1319 1320 /// \brief Right-shifts the 128-bit integer vector operand by the specified 1321 /// number of bytes. High-order bits are cleared. 1322 /// 1323 /// \headerfile <x86intrin.h> 1324 /// 1325 /// \code 1326 /// __m128i _mm_srli_si128(__m128i a, const int imm); 1327 /// \endcode 1328 /// 1329 /// This intrinsic corresponds to the \c VPSRLDQ / PSRLDQ instruction. 1330 /// 1331 /// \param a 1332 /// A 128-bit integer vector containing the source operand. 1333 /// \param imm 1334 /// An immediate value specifying the number of bytes to right-shift operand 1335 /// a. 1336 /// \returns A 128-bit integer vector containing the right-shifted value. 1337 #define _mm_srli_si128(a, imm) __extension__ ({ \ 1338 (__m128i)__builtin_shufflevector( \ 1339 (__v16qi)(__m128i)(a), \ 1340 (__v16qi)_mm_setzero_si128(), \ 1341 ((char)(imm)&0xF0) ? 16 : (char)(imm) + 0, \ 1342 ((char)(imm)&0xF0) ? 17 : (char)(imm) + 1, \ 1343 ((char)(imm)&0xF0) ? 18 : (char)(imm) + 2, \ 1344 ((char)(imm)&0xF0) ? 19 : (char)(imm) + 3, \ 1345 ((char)(imm)&0xF0) ? 20 : (char)(imm) + 4, \ 1346 ((char)(imm)&0xF0) ? 21 : (char)(imm) + 5, \ 1347 ((char)(imm)&0xF0) ? 22 : (char)(imm) + 6, \ 1348 ((char)(imm)&0xF0) ? 23 : (char)(imm) + 7, \ 1349 ((char)(imm)&0xF0) ? 24 : (char)(imm) + 8, \ 1350 ((char)(imm)&0xF0) ? 25 : (char)(imm) + 9, \ 1351 ((char)(imm)&0xF0) ? 26 : (char)(imm) + 10, \ 1352 ((char)(imm)&0xF0) ? 27 : (char)(imm) + 11, \ 1353 ((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \ 1354 ((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \ 1355 ((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \ 1356 ((char)(imm)&0xF0) ? 
31 : (char)(imm) + 15); }) 1357 1358 #define _mm_bsrli_si128(a, imm) \ 1359 _mm_srli_si128((a), (imm)) 1360 1361 /// \brief Right-shifts each of 16-bit values in the 128-bit integer vector 1362 /// operand by the specified number of bits. High-order bits are cleared. 1363 /// 1364 /// \headerfile <x86intrin.h> 1365 /// 1366 /// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction. 1367 /// 1368 /// \param __a 1369 /// A 128-bit integer vector containing the source operand. 1370 /// \param __count 1371 /// An integer value specifying the number of bits to right-shift each value 1372 /// in operand __a. 1373 /// \returns A 128-bit integer vector containing the right-shifted values. 1374 static __inline__ __m128i __DEFAULT_FN_ATTRS 1375 _mm_srli_epi16(__m128i __a, int __count) 1376 { 1377 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); 1378 } 1379 1380 /// \brief Right-shifts each of 16-bit values in the 128-bit integer vector 1381 /// operand by the specified number of bits. High-order bits are cleared. 1382 /// 1383 /// \headerfile <x86intrin.h> 1384 /// 1385 /// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction. 1386 /// 1387 /// \param __a 1388 /// A 128-bit integer vector containing the source operand. 1389 /// \param __count 1390 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 1391 /// to right-shift each value in operand __a. 1392 /// \returns A 128-bit integer vector containing the right-shifted values. 1393 static __inline__ __m128i __DEFAULT_FN_ATTRS 1394 _mm_srl_epi16(__m128i __a, __m128i __count) 1395 { 1396 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); 1397 } 1398 1399 /// \brief Right-shifts each of 32-bit values in the 128-bit integer vector 1400 /// operand by the specified number of bits. High-order bits are cleared. 1401 /// 1402 /// \headerfile <x86intrin.h> 1403 /// 1404 /// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction. 
1405 /// 1406 /// \param __a 1407 /// A 128-bit integer vector containing the source operand. 1408 /// \param __count 1409 /// An integer value specifying the number of bits to right-shift each value 1410 /// in operand __a. 1411 /// \returns A 128-bit integer vector containing the right-shifted values. 1412 static __inline__ __m128i __DEFAULT_FN_ATTRS 1413 _mm_srli_epi32(__m128i __a, int __count) 1414 { 1415 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); 1416 } 1417 1418 /// \brief Right-shifts each of 32-bit values in the 128-bit integer vector 1419 /// operand by the specified number of bits. High-order bits are cleared. 1420 /// 1421 /// \headerfile <x86intrin.h> 1422 /// 1423 /// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction. 1424 /// 1425 /// \param __a 1426 /// A 128-bit integer vector containing the source operand. 1427 /// \param __count 1428 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 1429 /// to right-shift each value in operand __a. 1430 /// \returns A 128-bit integer vector containing the right-shifted values. 1431 static __inline__ __m128i __DEFAULT_FN_ATTRS 1432 _mm_srl_epi32(__m128i __a, __m128i __count) 1433 { 1434 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); 1435 } 1436 1437 /// \brief Right-shifts each of 64-bit values in the 128-bit integer vector 1438 /// operand by the specified number of bits. High-order bits are cleared. 1439 /// 1440 /// \headerfile <x86intrin.h> 1441 /// 1442 /// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction. 1443 /// 1444 /// \param __a 1445 /// A 128-bit integer vector containing the source operand. 1446 /// \param __count 1447 /// An integer value specifying the number of bits to right-shift each value 1448 /// in operand __a. 1449 /// \returns A 128-bit integer vector containing the right-shifted values. 
1450 static __inline__ __m128i __DEFAULT_FN_ATTRS 1451 _mm_srli_epi64(__m128i __a, int __count) 1452 { 1453 return __builtin_ia32_psrlqi128((__v2di)__a, __count); 1454 } 1455 1456 /// \brief Right-shifts each of 64-bit values in the 128-bit integer vector 1457 /// operand by the specified number of bits. High-order bits are cleared. 1458 /// 1459 /// \headerfile <x86intrin.h> 1460 /// 1461 /// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction. 1462 /// 1463 /// \param __a 1464 /// A 128-bit integer vector containing the source operand. 1465 /// \param __count 1466 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 1467 /// to right-shift each value in operand __a. 1468 /// \returns A 128-bit integer vector containing the right-shifted values. 1469 static __inline__ __m128i __DEFAULT_FN_ATTRS 1470 _mm_srl_epi64(__m128i __a, __m128i __count) 1471 { 1472 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count); 1473 } 1474 1475 /// \brief Compares each of the corresponding 8-bit values of the 128-bit 1476 /// integer vectors for equality. Each comparison yields 0h for false, FFh 1477 /// for true. 1478 /// 1479 /// \headerfile <x86intrin.h> 1480 /// 1481 /// This intrinsic corresponds to the \c VPCMPEQB / PCMPEQB instruction. 1482 /// 1483 /// \param __a 1484 /// A 128-bit integer vector. 1485 /// \param __b 1486 /// A 128-bit integer vector. 1487 /// \returns A 128-bit integer vector containing the comparison results. 1488 static __inline__ __m128i __DEFAULT_FN_ATTRS 1489 _mm_cmpeq_epi8(__m128i __a, __m128i __b) 1490 { 1491 return (__m128i)((__v16qi)__a == (__v16qi)__b); 1492 } 1493 1494 /// \brief Compares each of the corresponding 16-bit values of the 128-bit 1495 /// integer vectors for equality. Each comparison yields 0h for false, FFFFh 1496 /// for true. 1497 /// 1498 /// \headerfile <x86intrin.h> 1499 /// 1500 /// This intrinsic corresponds to the \c VPCMPEQW / PCMPEQW instruction. 
1501 /// 1502 /// \param __a 1503 /// A 128-bit integer vector. 1504 /// \param __b 1505 /// A 128-bit integer vector. 1506 /// \returns A 128-bit integer vector containing the comparison results. 1507 static __inline__ __m128i __DEFAULT_FN_ATTRS 1508 _mm_cmpeq_epi16(__m128i __a, __m128i __b) 1509 { 1510 return (__m128i)((__v8hi)__a == (__v8hi)__b); 1511 } 1512 1513 /// \brief Compares each of the corresponding 32-bit values of the 128-bit 1514 /// integer vectors for equality. Each comparison yields 0h for false, 1515 /// FFFFFFFFh for true. 1516 /// 1517 /// \headerfile <x86intrin.h> 1518 /// 1519 /// This intrinsic corresponds to the \c VPCMPEQD / PCMPEQD instruction. 1520 /// 1521 /// \param __a 1522 /// A 128-bit integer vector. 1523 /// \param __b 1524 /// A 128-bit integer vector. 1525 /// \returns A 128-bit integer vector containing the comparison results. 1526 static __inline__ __m128i __DEFAULT_FN_ATTRS 1527 _mm_cmpeq_epi32(__m128i __a, __m128i __b) 1528 { 1529 return (__m128i)((__v4si)__a == (__v4si)__b); 1530 } 1531 1532 /// \brief Compares each of the corresponding signed 8-bit values of the 128-bit 1533 /// integer vectors to determine if the values in the first operand are 1534 /// greater than those in the second operand. Each comparison yields 0h for 1535 /// false, FFh for true. 1536 /// 1537 /// \headerfile <x86intrin.h> 1538 /// 1539 /// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction. 1540 /// 1541 /// \param __a 1542 /// A 128-bit integer vector. 1543 /// \param __b 1544 /// A 128-bit integer vector. 1545 /// \returns A 128-bit integer vector containing the comparison results. 1546 static __inline__ __m128i __DEFAULT_FN_ATTRS 1547 _mm_cmpgt_epi8(__m128i __a, __m128i __b) 1548 { 1549 /* This function always performs a signed comparison, but __v16qi is a char 1550 which may be signed or unsigned, so use __v16qs. 
*/ 1551 return (__m128i)((__v16qs)__a > (__v16qs)__b); 1552 } 1553 1554 /// \brief Compares each of the corresponding signed 16-bit values of the 1555 /// 128-bit integer vectors to determine if the values in the first operand 1556 /// are greater than those in the second operand. Each comparison yields 0h 1557 /// for false, FFFFh for true. 1558 /// 1559 /// \headerfile <x86intrin.h> 1560 /// 1561 /// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction. 1562 /// 1563 /// \param __a 1564 /// A 128-bit integer vector. 1565 /// \param __b 1566 /// A 128-bit integer vector. 1567 /// \returns A 128-bit integer vector containing the comparison results. 1568 static __inline__ __m128i __DEFAULT_FN_ATTRS 1569 _mm_cmpgt_epi16(__m128i __a, __m128i __b) 1570 { 1571 return (__m128i)((__v8hi)__a > (__v8hi)__b); 1572 } 1573 1574 /// \brief Compares each of the corresponding signed 32-bit values of the 1575 /// 128-bit integer vectors to determine if the values in the first operand 1576 /// are greater than those in the second operand. Each comparison yields 0h 1577 /// for false, FFFFFFFFh for true. 1578 /// 1579 /// \headerfile <x86intrin.h> 1580 /// 1581 /// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction. 1582 /// 1583 /// \param __a 1584 /// A 128-bit integer vector. 1585 /// \param __b 1586 /// A 128-bit integer vector. 1587 /// \returns A 128-bit integer vector containing the comparison results. 1588 static __inline__ __m128i __DEFAULT_FN_ATTRS 1589 _mm_cmpgt_epi32(__m128i __a, __m128i __b) 1590 { 1591 return (__m128i)((__v4si)__a > (__v4si)__b); 1592 } 1593 1594 /// \brief Compares each of the corresponding signed 8-bit values of the 128-bit 1595 /// integer vectors to determine if the values in the first operand are less 1596 /// than those in the second operand. Each comparison yields 0h for false, 1597 /// FFh for true. 
1598 /// 1599 /// \headerfile <x86intrin.h> 1600 /// 1601 /// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction. 1602 /// 1603 /// \param __a 1604 /// A 128-bit integer vector. 1605 /// \param __b 1606 /// A 128-bit integer vector. 1607 /// \returns A 128-bit integer vector containing the comparison results. 1608 static __inline__ __m128i __DEFAULT_FN_ATTRS 1609 _mm_cmplt_epi8(__m128i __a, __m128i __b) 1610 { 1611 return _mm_cmpgt_epi8(__b, __a); 1612 } 1613 1614 /// \brief Compares each of the corresponding signed 16-bit values of the 1615 /// 128-bit integer vectors to determine if the values in the first operand 1616 /// are less than those in the second operand. Each comparison yields 0h for 1617 /// false, FFFFh for true. 1618 /// 1619 /// \headerfile <x86intrin.h> 1620 /// 1621 /// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction. 1622 /// 1623 /// \param __a 1624 /// A 128-bit integer vector. 1625 /// \param __b 1626 /// A 128-bit integer vector. 1627 /// \returns A 128-bit integer vector containing the comparison results. 1628 static __inline__ __m128i __DEFAULT_FN_ATTRS 1629 _mm_cmplt_epi16(__m128i __a, __m128i __b) 1630 { 1631 return _mm_cmpgt_epi16(__b, __a); 1632 } 1633 1634 /// \brief Compares each of the corresponding signed 32-bit values of the 1635 /// 128-bit integer vectors to determine if the values in the first operand 1636 /// are less than those in the second operand. Each comparison yields 0h for 1637 /// false, FFFFFFFFh for true. 1638 /// 1639 /// \headerfile <x86intrin.h> 1640 /// 1641 /// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction. 1642 /// 1643 /// \param __a 1644 /// A 128-bit integer vector. 1645 /// \param __b 1646 /// A 128-bit integer vector. 1647 /// \returns A 128-bit integer vector containing the comparison results. 
1648 static __inline__ __m128i __DEFAULT_FN_ATTRS 1649 _mm_cmplt_epi32(__m128i __a, __m128i __b) 1650 { 1651 return _mm_cmpgt_epi32(__b, __a); 1652 } 1653 1654 #ifdef __x86_64__ 1655 /// \brief Converts a 64-bit signed integer value from the second operand into a 1656 /// double-precision value and returns it in the lower element of a [2 x 1657 /// double] vector; the upper element of the returned vector is copied from 1658 /// the upper element of the first operand. 1659 /// 1660 /// \headerfile <x86intrin.h> 1661 /// 1662 /// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction. 1663 /// 1664 /// \param __a 1665 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are 1666 /// copied to the upper 64 bits of the destination. 1667 /// \param __b 1668 /// A 64-bit signed integer operand containing the value to be converted. 1669 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 1670 /// converted value of the second operand. The upper 64 bits are copied from 1671 /// the upper 64 bits of the first operand. 1672 static __inline__ __m128d __DEFAULT_FN_ATTRS 1673 _mm_cvtsi64_sd(__m128d __a, long long __b) 1674 { 1675 __a[0] = __b; 1676 return __a; 1677 } 1678 1679 /// \brief Converts the first (lower) element of a vector of [2 x double] into a 1680 /// 64-bit signed integer value, according to the current rounding mode. 1681 /// 1682 /// \headerfile <x86intrin.h> 1683 /// 1684 /// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction. 1685 /// 1686 /// \param __a 1687 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1688 /// conversion. 1689 /// \returns A 64-bit signed integer containing the converted value. 
1690 static __inline__ long long __DEFAULT_FN_ATTRS 1691 _mm_cvtsd_si64(__m128d __a) 1692 { 1693 return __builtin_ia32_cvtsd2si64((__v2df)__a); 1694 } 1695 1696 /// \brief Converts the first (lower) element of a vector of [2 x double] into a 1697 /// 64-bit signed integer value, truncating the result when it is inexact. 1698 /// 1699 /// \headerfile <x86intrin.h> 1700 /// 1701 /// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction. 1702 /// 1703 /// \param __a 1704 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1705 /// conversion. 1706 /// \returns A 64-bit signed integer containing the converted value. 1707 static __inline__ long long __DEFAULT_FN_ATTRS 1708 _mm_cvttsd_si64(__m128d __a) 1709 { 1710 return __a[0]; 1711 } 1712 #endif 1713 1714 /// \brief Converts a vector of [4 x i32] into a vector of [4 x float]. 1715 /// 1716 /// \headerfile <x86intrin.h> 1717 /// 1718 /// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction. 1719 /// 1720 /// \param __a 1721 /// A 128-bit integer vector. 1722 /// \returns A 128-bit vector of [4 x float] containing the converted values. 1723 static __inline__ __m128 __DEFAULT_FN_ATTRS 1724 _mm_cvtepi32_ps(__m128i __a) 1725 { 1726 return __builtin_ia32_cvtdq2ps((__v4si)__a); 1727 } 1728 1729 /// \brief Converts a vector of [4 x float] into a vector of [4 x i32]. 1730 /// 1731 /// \headerfile <x86intrin.h> 1732 /// 1733 /// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction. 1734 /// 1735 /// \param __a 1736 /// A 128-bit vector of [4 x float]. 1737 /// \returns A 128-bit integer vector of [4 x i32] containing the converted 1738 /// values. 1739 static __inline__ __m128i __DEFAULT_FN_ATTRS 1740 _mm_cvtps_epi32(__m128 __a) 1741 { 1742 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a); 1743 } 1744 1745 /// \brief Converts a vector of [4 x float] into a vector of [4 x i32], 1746 /// truncating the result when it is inexact. 
1747 /// 1748 /// \headerfile <x86intrin.h> 1749 /// 1750 /// This intrinsic corresponds to the \c VCVTTPS2DQ / CVTTPS2DQ instruction. 1751 /// 1752 /// \param __a 1753 /// A 128-bit vector of [4 x float]. 1754 /// \returns A 128-bit vector of [4 x i32] containing the converted values. 1755 static __inline__ __m128i __DEFAULT_FN_ATTRS 1756 _mm_cvttps_epi32(__m128 __a) 1757 { 1758 return (__m128i)__builtin_convertvector((__v4sf)__a, __v4si); 1759 } 1760 1761 /// \brief Returns a vector of [4 x i32] where the lowest element is the input 1762 /// operand and the remaining elements are zero. 1763 /// 1764 /// \headerfile <x86intrin.h> 1765 /// 1766 /// This intrinsic corresponds to the \c VMOVD / MOVD instruction. 1767 /// 1768 /// \param __a 1769 /// A 32-bit signed integer operand. 1770 /// \returns A 128-bit vector of [4 x i32]. 1771 static __inline__ __m128i __DEFAULT_FN_ATTRS 1772 _mm_cvtsi32_si128(int __a) 1773 { 1774 return (__m128i)(__v4si){ __a, 0, 0, 0 }; 1775 } 1776 1777 #ifdef __x86_64__ 1778 /// \brief Returns a vector of [2 x i64] where the lower element is the input 1779 /// operand and the upper element is zero. 1780 /// 1781 /// \headerfile <x86intrin.h> 1782 /// 1783 /// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. 1784 /// 1785 /// \param __a 1786 /// A 64-bit signed integer operand containing the value to be converted. 1787 /// \returns A 128-bit vector of [2 x i64] containing the converted value. 1788 static __inline__ __m128i __DEFAULT_FN_ATTRS 1789 _mm_cvtsi64_si128(long long __a) 1790 { 1791 return (__m128i){ __a, 0 }; 1792 } 1793 #endif 1794 1795 /// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a 1796 /// 32-bit signed integer value. 1797 /// 1798 /// \headerfile <x86intrin.h> 1799 /// 1800 /// This intrinsic corresponds to the \c VMOVD / MOVD instruction. 1801 /// 1802 /// \param __a 1803 /// A vector of [4 x i32]. The least significant 32 bits are moved to the 1804 /// destination. 
1805 /// \returns A 32-bit signed integer containing the moved value. 1806 static __inline__ int __DEFAULT_FN_ATTRS 1807 _mm_cvtsi128_si32(__m128i __a) 1808 { 1809 __v4si __b = (__v4si)__a; 1810 return __b[0]; 1811 } 1812 1813 #ifdef __x86_64__ 1814 /// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a 1815 /// 64-bit signed integer value. 1816 /// 1817 /// \headerfile <x86intrin.h> 1818 /// 1819 /// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. 1820 /// 1821 /// \param __a 1822 /// A vector of [2 x i64]. The least significant 64 bits are moved to the 1823 /// destination. 1824 /// \returns A 64-bit signed integer containing the moved value. 1825 static __inline__ long long __DEFAULT_FN_ATTRS 1826 _mm_cvtsi128_si64(__m128i __a) 1827 { 1828 return __a[0]; 1829 } 1830 #endif 1831 1832 /// \brief Moves packed integer values from an aligned 128-bit memory location 1833 /// to elements in a 128-bit integer vector. 1834 /// 1835 /// \headerfile <x86intrin.h> 1836 /// 1837 /// This intrinsic corresponds to the \c VMOVDQA / MOVDQA instruction. 1838 /// 1839 /// \param __p 1840 /// An aligned pointer to a memory location containing integer values. 1841 /// \returns A 128-bit integer vector containing the moved values. 1842 static __inline__ __m128i __DEFAULT_FN_ATTRS 1843 _mm_load_si128(__m128i const *__p) 1844 { 1845 return *__p; 1846 } 1847 1848 /// \brief Moves packed integer values from an unaligned 128-bit memory location 1849 /// to elements in a 128-bit integer vector. 1850 /// 1851 /// \headerfile <x86intrin.h> 1852 /// 1853 /// This intrinsic corresponds to the \c VMOVDQU / MOVDQU instruction. 1854 /// 1855 /// \param __p 1856 /// A pointer to a memory location containing integer values. 1857 /// \returns A 128-bit integer vector containing the moved values. 
1858 static __inline__ __m128i __DEFAULT_FN_ATTRS 1859 _mm_loadu_si128(__m128i const *__p) 1860 { 1861 struct __loadu_si128 { 1862 __m128i __v; 1863 } __attribute__((__packed__, __may_alias__)); 1864 return ((struct __loadu_si128*)__p)->__v; 1865 } 1866 1867 /// \brief Returns a vector of [2 x i64] where the lower element is taken from 1868 /// the lower element of the operand, and the upper element is zero. 1869 /// 1870 /// \headerfile <x86intrin.h> 1871 /// 1872 /// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. 1873 /// 1874 /// \param __p 1875 /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of 1876 /// the destination. 1877 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the 1878 /// moved value. The higher order bits are cleared. 1879 static __inline__ __m128i __DEFAULT_FN_ATTRS 1880 _mm_loadl_epi64(__m128i const *__p) 1881 { 1882 struct __mm_loadl_epi64_struct { 1883 long long __u; 1884 } __attribute__((__packed__, __may_alias__)); 1885 return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0}; 1886 } 1887 1888 /// \brief Generates a 128-bit vector of [4 x i32] with unspecified content. 1889 /// This could be used as an argument to another intrinsic function where the 1890 /// argument is required but the value is not actually used. 1891 /// 1892 /// \headerfile <x86intrin.h> 1893 /// 1894 /// This intrinsic has no corresponding instruction. 1895 /// 1896 /// \returns A 128-bit vector of [4 x i32] with unspecified content. 1897 static __inline__ __m128i __DEFAULT_FN_ATTRS 1898 _mm_undefined_si128(void) 1899 { 1900 return (__m128i)__builtin_ia32_undef128(); 1901 } 1902 1903 /// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 1904 /// the specified 64-bit integer values. 1905 /// 1906 /// \headerfile <x86intrin.h> 1907 /// 1908 /// This intrinsic is a utility function and does not correspond to a specific 1909 /// instruction. 
1910 /// 1911 /// \param __q1 1912 /// A 64-bit integer value used to initialize the upper 64 bits of the 1913 /// destination vector of [2 x i64]. 1914 /// \param __q0 1915 /// A 64-bit integer value used to initialize the lower 64 bits of the 1916 /// destination vector of [2 x i64]. 1917 /// \returns An initialized 128-bit vector of [2 x i64] containing the values 1918 /// provided in the operands. 1919 static __inline__ __m128i __DEFAULT_FN_ATTRS 1920 _mm_set_epi64x(long long __q1, long long __q0) 1921 { 1922 return (__m128i){ __q0, __q1 }; 1923 } 1924 1925 /// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 1926 /// the specified 64-bit integer values. 1927 /// 1928 /// \headerfile <x86intrin.h> 1929 /// 1930 /// This intrinsic is a utility function and does not correspond to a specific 1931 /// instruction. 1932 /// 1933 /// \param __q1 1934 /// A 64-bit integer value used to initialize the upper 64 bits of the 1935 /// destination vector of [2 x i64]. 1936 /// \param __q0 1937 /// A 64-bit integer value used to initialize the lower 64 bits of the 1938 /// destination vector of [2 x i64]. 1939 /// \returns An initialized 128-bit vector of [2 x i64] containing the values 1940 /// provided in the operands. 1941 static __inline__ __m128i __DEFAULT_FN_ATTRS 1942 _mm_set_epi64(__m64 __q1, __m64 __q0) 1943 { 1944 return (__m128i){ (long long)__q0, (long long)__q1 }; 1945 } 1946 1947 /// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with 1948 /// the specified 32-bit integer values. 1949 /// 1950 /// \headerfile <x86intrin.h> 1951 /// 1952 /// This intrinsic is a utility function and does not correspond to a specific 1953 /// instruction. 1954 /// 1955 /// \param __i3 1956 /// A 32-bit integer value used to initialize bits [127:96] of the 1957 /// destination vector. 1958 /// \param __i2 1959 /// A 32-bit integer value used to initialize bits [95:64] of the destination 1960 /// vector. 
1961 /// \param __i1 1962 /// A 32-bit integer value used to initialize bits [63:32] of the destination 1963 /// vector. 1964 /// \param __i0 1965 /// A 32-bit integer value used to initialize bits [31:0] of the destination 1966 /// vector. 1967 /// \returns An initialized 128-bit vector of [4 x i32] containing the values 1968 /// provided in the operands. 1969 static __inline__ __m128i __DEFAULT_FN_ATTRS 1970 _mm_set_epi32(int __i3, int __i2, int __i1, int __i0) 1971 { 1972 return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; 1973 } 1974 1975 /// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with 1976 /// the specified 16-bit integer values. 1977 /// 1978 /// \headerfile <x86intrin.h> 1979 /// 1980 /// This intrinsic is a utility function and does not correspond to a specific 1981 /// instruction. 1982 /// 1983 /// \param __w7 1984 /// A 16-bit integer value used to initialize bits [127:112] of the 1985 /// destination vector. 1986 /// \param __w6 1987 /// A 16-bit integer value used to initialize bits [111:96] of the 1988 /// destination vector. 1989 /// \param __w5 1990 /// A 16-bit integer value used to initialize bits [95:80] of the destination 1991 /// vector. 1992 /// \param __w4 1993 /// A 16-bit integer value used to initialize bits [79:64] of the destination 1994 /// vector. 1995 /// \param __w3 1996 /// A 16-bit integer value used to initialize bits [63:48] of the destination 1997 /// vector. 1998 /// \param __w2 1999 /// A 16-bit integer value used to initialize bits [47:32] of the destination 2000 /// vector. 2001 /// \param __w1 2002 /// A 16-bit integer value used to initialize bits [31:16] of the destination 2003 /// vector. 2004 /// \param __w0 2005 /// A 16-bit integer value used to initialize bits [15:0] of the destination 2006 /// vector. 2007 /// \returns An initialized 128-bit vector of [8 x i16] containing the values 2008 /// provided in the operands. 
2009 static __inline__ __m128i __DEFAULT_FN_ATTRS 2010 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0) 2011 { 2012 return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; 2013 } 2014 2015 /// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with 2016 /// the specified 8-bit integer values. 2017 /// 2018 /// \headerfile <x86intrin.h> 2019 /// 2020 /// This intrinsic is a utility function and does not correspond to a specific 2021 /// instruction. 2022 /// 2023 /// \param __b15 2024 /// Initializes bits [127:120] of the destination vector. 2025 /// \param __b14 2026 /// Initializes bits [119:112] of the destination vector. 2027 /// \param __b13 2028 /// Initializes bits [111:104] of the destination vector. 2029 /// \param __b12 2030 /// Initializes bits [103:96] of the destination vector. 2031 /// \param __b11 2032 /// Initializes bits [95:88] of the destination vector. 2033 /// \param __b10 2034 /// Initializes bits [87:80] of the destination vector. 2035 /// \param __b9 2036 /// Initializes bits [79:72] of the destination vector. 2037 /// \param __b8 2038 /// Initializes bits [71:64] of the destination vector. 2039 /// \param __b7 2040 /// Initializes bits [63:56] of the destination vector. 2041 /// \param __b6 2042 /// Initializes bits [55:48] of the destination vector. 2043 /// \param __b5 2044 /// Initializes bits [47:40] of the destination vector. 2045 /// \param __b4 2046 /// Initializes bits [39:32] of the destination vector. 2047 /// \param __b3 2048 /// Initializes bits [31:24] of the destination vector. 2049 /// \param __b2 2050 /// Initializes bits [23:16] of the destination vector. 2051 /// \param __b1 2052 /// Initializes bits [15:8] of the destination vector. 2053 /// \param __b0 2054 /// Initializes bits [7:0] of the destination vector. 
2055 /// \returns An initialized 128-bit vector of [16 x i8] containing the values 2056 /// provided in the operands. 2057 static __inline__ __m128i __DEFAULT_FN_ATTRS 2058 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) 2059 { 2060 return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; 2061 } 2062 2063 /// \brief Initializes both values in a 128-bit integer vector with the 2064 /// specified 64-bit integer value. 2065 /// 2066 /// \headerfile <x86intrin.h> 2067 /// 2068 /// This intrinsic is a utility function and does not correspond to a specific 2069 /// instruction. 2070 /// 2071 /// \param __q 2072 /// Integer value used to initialize the elements of the destination integer 2073 /// vector. 2074 /// \returns An initialized 128-bit integer vector of [2 x i64] with both 2075 /// elements containing the value provided in the operand. 2076 static __inline__ __m128i __DEFAULT_FN_ATTRS 2077 _mm_set1_epi64x(long long __q) 2078 { 2079 return (__m128i){ __q, __q }; 2080 } 2081 2082 /// \brief Initializes both values in a 128-bit vector of [2 x i64] with the 2083 /// specified 64-bit value. 2084 /// 2085 /// \headerfile <x86intrin.h> 2086 /// 2087 /// This intrinsic is a utility function and does not correspond to a specific 2088 /// instruction. 2089 /// 2090 /// \param __q 2091 /// A 64-bit value used to initialize the elements of the destination integer 2092 /// vector. 2093 /// \returns An initialized 128-bit vector of [2 x i64] with all elements 2094 /// containing the value provided in the operand. 
2095 static __inline__ __m128i __DEFAULT_FN_ATTRS 2096 _mm_set1_epi64(__m64 __q) 2097 { 2098 return (__m128i){ (long long)__q, (long long)__q }; 2099 } 2100 2101 /// \brief Initializes all values in a 128-bit vector of [4 x i32] with the 2102 /// specified 32-bit value. 2103 /// 2104 /// \headerfile <x86intrin.h> 2105 /// 2106 /// This intrinsic is a utility function and does not correspond to a specific 2107 /// instruction. 2108 /// 2109 /// \param __i 2110 /// A 32-bit value used to initialize the elements of the destination integer 2111 /// vector. 2112 /// \returns An initialized 128-bit vector of [4 x i32] with all elements 2113 /// containing the value provided in the operand. 2114 static __inline__ __m128i __DEFAULT_FN_ATTRS 2115 _mm_set1_epi32(int __i) 2116 { 2117 return (__m128i)(__v4si){ __i, __i, __i, __i }; 2118 } 2119 2120 /// \brief Initializes all values in a 128-bit vector of [8 x i16] with the 2121 /// specified 16-bit value. 2122 /// 2123 /// \headerfile <x86intrin.h> 2124 /// 2125 /// This intrinsic is a utility function and does not correspond to a specific 2126 /// instruction. 2127 /// 2128 /// \param __w 2129 /// A 16-bit value used to initialize the elements of the destination integer 2130 /// vector. 2131 /// \returns An initialized 128-bit vector of [8 x i16] with all elements 2132 /// containing the value provided in the operand. 2133 static __inline__ __m128i __DEFAULT_FN_ATTRS 2134 _mm_set1_epi16(short __w) 2135 { 2136 return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w }; 2137 } 2138 2139 /// \brief Initializes all values in a 128-bit vector of [16 x i8] with the 2140 /// specified 8-bit value. 2141 /// 2142 /// \headerfile <x86intrin.h> 2143 /// 2144 /// This intrinsic is a utility function and does not correspond to a specific 2145 /// instruction. 2146 /// 2147 /// \param __b 2148 /// An 8-bit value used to initialize the elements of the destination integer 2149 /// vector. 
/// \returns An initialized 128-bit vector of [16 x i8] with all elements
///    containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi8(char __b)
{
  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
}

/// \brief Builds a 128-bit vector of [2 x i64] from two 64-bit operands given
///    in element order (lowest first).
///
/// \param __q0
///    A 64-bit value placed in element 0 (the lower 64 bits) of the result.
/// \param __q1
///    A 64-bit value placed in element 1 (the upper 64 bits) of the result.
/// \returns A 128-bit vector of [2 x i64] containing the two operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi64(__m64 __q0, __m64 __q1)
{
  return (__m128i){ (long long)__q0, (long long)__q1 };
}

/// \brief Builds a 128-bit vector of [4 x i32] from four 32-bit integers given
///    in element order (lowest first).
///
/// \param __i0
///    Value placed in element 0 of the result.
/// \param __i1
///    Value placed in element 1 of the result.
/// \param __i2
///    Value placed in element 2 of the result.
/// \param __i3
///    Value placed in element 3 of the result.
/// \returns A 128-bit vector of [4 x i32] containing the four operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
{
  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
}

/// \brief Builds a 128-bit vector of [8 x i16] from eight 16-bit integers
///    given in element order: \a __w0 becomes element 0 and \a __w7 becomes
///    element 7.
///
/// \returns A 128-bit vector of [8 x i16] containing the eight operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
{
  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
}

/// \brief Builds a 128-bit vector of [16 x i8] from sixteen 8-bit integers
///    given in element order: \a __b0 becomes element 0 and \a __b15 becomes
///    element 15.
///
/// \returns A 128-bit vector of [16 x i8] containing the sixteen operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
{
  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
}

/// \brief Returns a 128-bit integer vector with both 64-bit elements set to
///    zero.
///
/// \returns A 128-bit vector of [2 x i64] whose elements are all zero.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setzero_si128(void)
{
  return (__m128i){ 0LL, 0LL };
}

/// \brief Stores a 128-bit integer vector to memory with a plain vector
///    assignment.
///
/// NOTE(review): the store goes through an ordinary __m128i lvalue, so \a __p
///    is presumably required to be 16-byte aligned -- confirm.
///
/// \param __p
///    A pointer to the destination memory location.
/// \param __b
///    The 128-bit integer vector to store.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_si128(__m128i *__p, __m128i __b)
{
  *__p = __b;
}

/// \brief Stores a 128-bit integer vector to memory through a packed,
///    may_alias wrapper structure, so that no alignment of \a __p is assumed.
///
/// \param __p
///    A pointer to the destination memory location.
/// \param __b
///    The 128-bit integer vector to store.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_si128(__m128i *__p, __m128i __b)
{
  struct __storeu_si128 {
    __m128i __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_si128*)__p)->__v = __b;
}

/// \brief Forwards to the \c __builtin_ia32_maskmovdqu builtin with both vector
///    operands viewed as [16 x i8].
///
/// NOTE(review): presumably a byte-granular conditional store of \a __d to
///    \a __p, with \a __n selecting the bytes written -- confirm against the
///    instruction reference.
///
/// \param __d
///    A 128-bit integer vector passed as the first builtin operand.
/// \param __n
///    A 128-bit integer vector passed as the second builtin operand.
/// \param __p
///    A pointer passed as the third builtin operand.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
{
  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
}

/// \brief Stores element 0 (the lower 64 bits) of a 128-bit integer vector to
///    memory through a packed, may_alias wrapper structure, so that no
///    alignment of \a __p is assumed.
///
/// \param __p
///    A pointer to the destination; only 64 bits are written there.
/// \param __a
///    A 128-bit vector of [2 x i64] whose element 0 is stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_epi64(__m128i *__p, __m128i __a)
{
  struct __mm_storel_epi64_struct {
    long long __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
}

/// \brief Stores a 128-bit vector of [2 x double] to memory using
///    \c __builtin_nontemporal_store.
///
/// \param __p
///    A pointer to the destination, reinterpreted as a pointer to a
///    [2 x double] vector.
/// \param __a
///    The vector of [2 x double] to store.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pd(double *__p, __m128d __a)
{
  __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
}

/// \brief Stores a 128-bit integer vector to memory using
///    \c __builtin_nontemporal_store.
///
/// \param __p
///    A pointer to the destination 128-bit memory location.
/// \param __a
///    The 128-bit integer vector to store.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si128(__m128i *__p, __m128i __a)
{
  __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
}

/// \brief Forwards a 32-bit integer store to the \c __builtin_ia32_movnti
///    builtin (presumably a non-temporal store -- confirm).
///
/// \param __p
///    A pointer to the destination 32-bit memory location.
/// \param __a
///    The 32-bit integer value passed to the builtin.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si32(int *__p, int __a)
{
  __builtin_ia32_movnti(__p, __a);
}

#ifdef __x86_64__
/// \brief Forwards a 64-bit integer store to the \c __builtin_ia32_movnti64
///    builtin (presumably a non-temporal store -- confirm). Only available
///    when \c __x86_64__ is defined.
///
/// \param __p
///    A pointer to the destination 64-bit memory location.
/// \param __a
///    The 64-bit integer value passed to the builtin.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si64(long long *__p, long long __a)
{
  __builtin_ia32_movnti64(__p, __a);
}
#endif

/// \brief Invokes the \c __builtin_ia32_clflush builtin on the given address.
///
/// \param __p
///    The address passed to the builtin.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_clflush(void const *__p)
{
  __builtin_ia32_clflush(__p);
}

/// \brief Invokes the \c __builtin_ia32_lfence builtin. Takes no operands and
///    returns no value.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_lfence(void)
{
  __builtin_ia32_lfence();
}

/// \brief Invokes the \c __builtin_ia32_mfence builtin. Takes no operands and
///    returns no value.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_mfence(void)
{
  __builtin_ia32_mfence();
}

/// \brief Forwards to the \c __builtin_ia32_packsswb128 builtin with both
///    operands viewed as [8 x i16].
///
/// NOTE(review): presumably narrows the sixteen 16-bit inputs to 8-bit values
///    with signed saturation -- confirm against the instruction reference.
///
/// \returns The builtin's result as a 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
}

/// \brief Forwards to the \c __builtin_ia32_packssdw128 builtin with both
///    operands viewed as [4 x i32].
///
/// NOTE(review): presumably narrows the eight 32-bit inputs to 16-bit values
///    with signed saturation -- confirm against the instruction reference.
///
/// \returns The builtin's result as a 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
}

/// \brief Forwards to the \c __builtin_ia32_packuswb128 builtin with both
///    operands viewed as [8 x i16].
///
/// NOTE(review): presumably narrows the sixteen 16-bit inputs to 8-bit values
///    with unsigned saturation -- confirm against the instruction reference.
///
/// \returns The builtin's result as a 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packus_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
}

/// \brief Returns one 16-bit element of a 128-bit vector of [8 x i16].
///
/// \param __a
///    The source vector, viewed as [8 x i16].
/// \param __imm
///    The element index; only its low three bits are used.
/// \returns The selected element converted to unsigned short, so the upper
///    bits of the returned int are zero.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_extract_epi16(__m128i __a, int __imm)
{
  __v8hi __b = (__v8hi)__a;
  return (unsigned short)__b[__imm & 7];
}

/// \brief Returns a copy of a 128-bit vector of [8 x i16] with one element
///    replaced.
///
/// \param __a
///    The source vector, viewed as [8 x i16].
/// \param __b
///    The new value; it is converted to a 16-bit element on assignment.
/// \param __imm
///    The element index; only its low three bits are used.
/// \returns The modified vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_insert_epi16(__m128i __a, int __b, int __imm)
{
  __v8hi __c = (__v8hi)__a;
  __c[__imm & 7] = __b;
  return (__m128i)__c;
}

/// \brief Forwards to the \c __builtin_ia32_pmovmskb128 builtin with the
///    operand viewed as [16 x i8].
///
/// NOTE(review): presumably gathers the most significant bit of each byte into
///    the low 16 bits of the result -- confirm against the instruction
///    reference.
///
/// \returns The integer produced by the builtin.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_epi8(__m128i __a)
{
  return __builtin_ia32_pmovmskb128((__v16qi)__a);
}

/// \brief Permutes the elements of a vector of [4 x i32]. Result element k
///    (k = 0..3) is source element ((imm >> 2k) & 3). Every index is below 4,
///    so the undefined second shuffle operand is never selected.
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
                                   (__v4si)_mm_undefined_si128(), \
                                   ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
                                   ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); })

/// \brief Permutes the four low elements of a vector of [8 x i16]. Result
///    element k (k = 0..3) is source element ((imm >> 2k) & 3); elements 4..7
///    are copied unchanged. The undefined second shuffle operand is never
///    selected.
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_undefined_si128(), \
                                   ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
                                   ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \
                                   4, 5, 6, 7); })

/// \brief Permutes the four high elements of a vector of [8 x i16]. Elements
///    0..3 are copied unchanged; result element 4+k (k = 0..3) is source
///    element 4 + ((imm >> 2k) & 3). The undefined second shuffle operand is
///    never selected.
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_undefined_si128(), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) >> 0) & 0x3), \
                                   4 + (((imm) >> 2) & 0x3), \
                                   4 + (((imm) >> 4) & 0x3), \
                                   4 + (((imm) >> 6) & 0x3)); })

/// \brief Interleaves the upper eight bytes of two vectors of [16 x i8]: the
///    result is a[8], b[8], a[9], b[9], ..., a[15], b[15].
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
}

/// \brief Interleaves the upper four elements of two vectors of [8 x i16]: the
///    result is a[4], b[4], a[5], b[5], a[6], b[6], a[7], b[7].
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
}

/// \brief Interleaves the upper two elements of two vectors of [4 x i32]: the
///    result is a[2], b[2], a[3], b[3].
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
}

/// \brief Combines the upper elements of two vectors of [2 x i64]: the result
///    is a[1], b[1].
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
}

/// \brief Interleaves the lower eight bytes of two vectors of [16 x i8]: the
///    result is a[0], b[0], a[1], b[1], ..., a[7], b[7].
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
}

/// \brief Interleaves the lower four elements of two vectors of [8 x i16]: the
///    result is a[0], b[0], a[1], b[1], a[2], b[2], a[3], b[3].
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
}

/// \brief Interleaves the lower two elements of two vectors of [4 x i32]: the
///    result is a[0], b[0], a[1], b[1].
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
}

/// \brief Combines the lower elements of two vectors of [2 x i64]: the result
///    is a[0], b[0].
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
}

/// \brief Returns element 0 (the lower 64 bits) of a 128-bit integer vector
///    as a 64-bit \c __m64 value.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_movepi64_pi64(__m128i __a)
{
  return (__m64)__a[0];
}

/// \brief Builds a vector of [2 x i64] whose lower element holds the 64-bit
///    operand and whose upper element is zero.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_movpi64_epi64(__m64 __a)
{
  return (__m128i){ (long long)__a, 0 };
}

/// \brief Returns a vector of [2 x i64] whose lower element is element 0 of
///    the operand and whose upper element is zero (shuffle index 2 selects
///    element 0 of the zero vector).
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_move_epi64(__m128i __a)
{
  return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
}

/// \brief Combines the upper elements of two vectors of [2 x double]: the
///    result is a[1], b[1].
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpackhi_pd(__m128d __a, __m128d __b)
{
  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
}

/// \brief Combines the lower elements of two vectors of [2 x double]: the
///    result is a[0], b[0].
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpacklo_pd(__m128d __a, __m128d __b)
{
  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
}

/// \brief Forwards to the \c __builtin_ia32_movmskpd builtin with the operand
///    viewed as [2 x double].
///
/// NOTE(review): presumably gathers the sign bit of each element into the low
///    two bits of the result -- confirm against the instruction reference.
///
/// \returns The integer produced by the builtin.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_pd(__m128d __a)
{
  return __builtin_ia32_movmskpd((__v2df)__a);
}

/// \brief Builds a vector of [2 x double] from one element of each operand.
///    Result element 0 is a[i & 1]; result element 1 is b[(i >> 1) & 1].
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
                                   0 + (((i) >> 0) & 0x1), \
                                   2 + (((i) >> 1) & 0x1)); })

/// \brief Reinterprets a 128-bit vector of [2 x double] as a 128-bit vector of
///    [4 x float] using a vector cast; no value conversion is written here.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castpd_ps(__m128d __a)
{
  return (__m128)__a;
}

/// \brief Reinterprets a 128-bit vector of [2 x double] as a 128-bit integer
///    vector using a vector cast; no value conversion is written here.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castpd_si128(__m128d __a)
{
  return (__m128i)__a;
}

/// \brief Reinterprets a 128-bit vector of [4 x float] as a 128-bit vector of
///    [2 x double] using a vector cast; no value conversion is written here.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castps_pd(__m128 __a)
{
  return (__m128d)__a;
}

/// \brief Reinterprets a 128-bit vector of [4 x float] as a 128-bit integer
///    vector using a vector cast; no value conversion is written here.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castps_si128(__m128 __a)
{
  return (__m128i)__a;
}

/// \brief Reinterprets a 128-bit integer vector as a 128-bit vector of
///    [4 x float] using a vector cast; no value conversion is written here.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castsi128_ps(__m128i __a)
{
  return (__m128)__a;
}

/// \brief Reinterprets a 128-bit integer vector as a 128-bit vector of
///    [2 x double] using a vector cast; no value conversion is written here.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castsi128_pd(__m128i __a)
{
  return (__m128d)__a;
}

/// \brief Invokes the \c __builtin_ia32_pause builtin. Takes no operands and
///    returns no value.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_pause(void)
{
  __builtin_ia32_pause();
}

#undef __DEFAULT_FN_ATTRS

/* Packs x into bit 1 and y into bit 0 of an immediate, matching the two
 * selector bits read by _mm_shuffle_pd. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

#endif /* __EMMINTRIN_H */