/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __EMMINTRIN_H
#define __EMMINTRIN_H

#ifndef __SSE2__
#error "SSE2 instruction set not enabled"
#else

#include <xmmintrin.h>

/* Public SSE2 vector types: two doubles, and a generic 128-bit integer
   vector viewed as two 64-bit lanes. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Type defines.
   Internal element-wise views of the 128-bit registers used for casts. */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Scalar double arithmetic (the _sd forms): operate on lane 0 only and pass
   lane 1 of __a through unchanged.  Packed forms (_pd) operate on both
   lanes using the generic vector operators. */

/* Adds the lower doubles; upper lane taken from __a. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_sd(__m128d __a, __m128d __b)
{
  __a[0] += __b[0];
  return __a;
}

/* Lane-wise addition of two vectors of doubles. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_pd(__m128d __a, __m128d __b)
{
  return __a + __b;
}

/* Subtracts the lower doubles; upper lane taken from __a. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_sd(__m128d __a, __m128d __b)
{
  __a[0] -= __b[0];
  return __a;
}

/* Lane-wise subtraction. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_pd(__m128d __a, __m128d __b)
{
  return __a - __b;
}

/* Multiplies the lower doubles; upper lane taken from __a. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_sd(__m128d __a, __m128d __b)
{
  __a[0] *= __b[0];
  return __a;
}

/* Lane-wise multiplication. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_pd(__m128d __a, __m128d __b)
{
  return __a * __b;
}

/* Divides the lower doubles; upper lane taken from __a. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_sd(__m128d __a, __m128d __b)
{
  __a[0] /= __b[0];
  return __a;
}

/* Lane-wise division. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_pd(__m128d __a, __m128d __b)
{
  return __a / __b;
}

/* sqrt of __b's lower lane in lane 0; lane 1 copied from __a (matches the
   SQRTSD destination-merge semantics). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_sd(__m128d __a, __m128d __b)
{
  __m128d __c = __builtin_ia32_sqrtsd(__b);
  return (__m128d) { __c[0], __a[1] };
}

/* Lane-wise square root. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_pd(__m128d __a)
{
  return __builtin_ia32_sqrtpd(__a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_minsd(__a, __b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_minpd(__a, __b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_maxsd(__a, __b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_maxpd(__a, __b);
}

/* Bitwise logic on double vectors: the lanes are reinterpreted as integer
   vectors so the generic &, |, ^, ~ operators can be used, then cast back.
   The bit pattern is unchanged by the casts. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_and_pd(__m128d __a, __m128d __b)
{
  return (__m128d)((__v4si)__a & (__v4si)__b);
}

/* (~a) & b — note __a is the complemented operand, per ANDNPD. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_andnot_pd(__m128d __a, __m128d __b)
{
  return (__m128d)(~(__v4si)__a & (__v4si)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_or_pd(__m128d __a, __m128d __b)
{
  return (__m128d)((__v4si)__a | (__v4si)__b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_xor_pd(__m128d __a, __m128d __b)
{
  return (__m128d)((__v4si)__a ^ (__v4si)__b);
}

/* Packed double compares.  The third argument of __builtin_ia32_cmppd is the
   CMPPD predicate immediate: 0=eq, 1=lt, 2=le, 3=unord, 4=neq, 5=nlt,
   6=nle, 7=ord.  gt/ge (and ngt/nge) have no immediate of their own, so the
   operands are swapped and lt/le (nlt/nle) are used instead. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__a, __b, 0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__a, __b, 1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__a, __b, 2);
}

/* a > b computed as b < a (operands swapped). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__b, __a, 1);
}

/* a >= b computed as b <= a (operands swapped). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__b, __a, 2);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__a, __b, 7);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__a, __b, 3);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__a, __b, 4);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__a, __b, 5);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__a, __b, 6);
}

/* not(a > b) computed as not(b < a). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__b, __a, 5);
}

/* not(a >= b) computed as not(b <= a). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__b, __a, 6);
}

/* Scalar compares (lane 0 only); same predicate immediates as above. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 2);
}

/* Swapped-operand scalar compares: CMPSD leaves the upper lane of its first
   operand in the result, so after swapping, lane 1 must be restored from
   __a explicitly. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_sd(__m128d __a, __m128d __b)
{
  __m128d __c = __builtin_ia32_cmpsd(__b, __a, 1);
  return (__m128d) { __c[0], __a[1] };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_sd(__m128d __a, __m128d __b)
{
  __m128d __c = __builtin_ia32_cmpsd(__b, __a, 2);
  return (__m128d) { __c[0], __a[1] };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 7);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 3);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 4);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 5);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 6);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_sd(__m128d __a, __m128d __b)
{
  __m128d __c = __builtin_ia32_cmpsd(__b, __a, 5);
  return (__m128d) { __c[0], __a[1] };
}

static __inline__ __m128d
__attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_sd(__m128d __a, __m128d __b)
{
  __m128d __c = __builtin_ia32_cmpsd(__b, __a, 6);
  return (__m128d) { __c[0], __a[1] };
}

/* Ordered scalar comparisons returning 0/1 (COMISD). */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_comisdeq(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_comisdlt(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_comisdle(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_comisdgt(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_comisdge(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_comisdneq(__a, __b);
}

/* Unordered (quiet-NaN-safe) scalar comparisons returning 0/1 (UCOMISD). */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_ucomisdeq(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_ucomisdlt(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_ucomisdle(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_ucomisdgt(__a,
__b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_ucomisdge(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_ucomisdneq(__a, __b);
}

/* Conversions between double, float, and integer vector forms. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_ps(__m128d __a)
{
  return __builtin_ia32_cvtpd2ps(__a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pd(__m128 __a)
{
  return __builtin_ia32_cvtps2pd(__a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_pd(__m128i __a)
{
  return __builtin_ia32_cvtdq2pd((__v4si)__a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_epi32(__m128d __a)
{
  return __builtin_ia32_cvtpd2dq(__a);
}

/* Rounds according to the current MXCSR rounding mode (CVTSD2SI). */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si32(__m128d __a)
{
  return __builtin_ia32_cvtsd2si(__a);
}

/* Lower double of __b narrowed into lane 0 of the float vector __a. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_ss(__m128 __a, __m128d __b)
{
  __a[0] = __b[0];
  return __a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_sd(__m128d __a, int __b)
{
  __a[0] = __b;
  return __a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_sd(__m128d __a, __m128 __b)
{
  __a[0] = __b[0];
  return __a;
}

/* Truncating (round-toward-zero) conversions. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_epi32(__m128d __a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq(__a);
}

/* Truncation expressed as a plain C double-to-int conversion. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si32(__m128d __a)
{
  return __a[0];
}

/* MMX-register (64-bit) conversion variants. */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_pi32(__m128d __a)
{
  return (__m64)__builtin_ia32_cvtpd2pi(__a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_pi32(__m128d __a)
{
  return (__m64)__builtin_ia32_cvttpd2pi(__a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_pd(__m64 __a)
{
  return __builtin_ia32_cvtpi2pd((__v2si)__a);
}

/* Extracts the lower double. */
static __inline__ double __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_f64(__m128d __a)
{
  return __a[0];
}

/* Aligned 16-byte load. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_pd(double const *__dp)
{
  return *(__m128d*)__dp;
}

/* Unaligned scalar loads go through a packed, may_alias single-member
   struct so the compiler emits an unaligned, aliasing-safe access. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load1_pd(double const *__dp)
{
  struct __mm_load1_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
  return (__m128d){ __u, __u };
}

#define _mm_load_pd1(dp) _mm_load1_pd(dp)

/* Aligned load with the two lanes swapped. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadr_pd(double const *__dp)
{
  __m128d __u = *(__m128d*)__dp;
  return __builtin_shufflevector(__u, __u, 1, 0);
}

/* NOTE(review): these attribute spellings are not the reserved
   __packed__/__may_alias__ forms and could be broken by user macros named
   `packed`/`may_alias`; later versions of this header use the reserved
   spellings — consider updating. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadu_pd(double const *__dp)
{
  struct __loadu_pd {
    __m128d __v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_pd*)__dp)->__v;
}

/* Loads one double into lane 0, zeroes lane 1. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_sd(double const *__dp)
{
  struct __mm_load_sd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((struct
__mm_load_sd_struct*)__dp)->__u;
  return (__m128d){ __u, 0 };
}

/* Replaces the high lane of __a with the double at *__dp. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pd(__m128d __a, double const *__dp)
{
  struct __mm_loadh_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
  return (__m128d){ __a[0], __u };
}

/* Replaces the low lane of __a with the double at *__dp. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pd(__m128d __a, double const *__dp)
{
  struct __mm_loadl_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
  return (__m128d){ __u, __a[1] };
}

/* Constructors.  Note _mm_set_pd takes its arguments high-lane first,
   while _mm_setr_pd takes them in memory (low-lane first) order. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_sd(double __w)
{
  return (__m128d){ __w, 0 };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set1_pd(double __w)
{
  return (__m128d){ __w, __w };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_pd(double __w, double __x)
{
  return (__m128d){ __x, __w };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setr_pd(double __w, double __x)
{
  return (__m128d){ __w, __x };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setzero_pd(void)
{
  return (__m128d){ 0, 0 };
}

/* Low lane from __b, high lane from __a (MOVSD). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_move_sd(__m128d __a, __m128d __b)
{
  return (__m128d){ __b[0], __a[1] };
}

/* Scalar stores use packed, may_alias structs for unaligned-safe writes. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_sd(double *__dp, __m128d __a)
{
  struct __mm_store_sd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
}

/* Stores lane 0 into both consecutive doubles at *__dp. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_pd(double *__dp, __m128d __a)
{
  struct __mm_store1_pd_struct {
    double __u[2];
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
  ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
}

/* Aligned 16-byte store. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_pd(double *__dp, __m128d __a)
{
  *(__m128d *)__dp = __a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_pd(double *__dp, __m128d __a)
{
  __builtin_ia32_storeupd(__dp, __a);
}

/* Aligned store with lanes swapped. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_pd(double *__dp, __m128d __a)
{
  __a = __builtin_shufflevector(__a, __a, 1, 0);
  *(__m128d *)__dp = __a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeh_pd(double *__dp, __m128d __a)
{
  struct __mm_storeh_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
}

/* Reuses the storeh struct tag; stores lane 0. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_pd(double *__dp, __m128d __a)
{
  struct __mm_storeh_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
}

/* Element-wise integer addition, wrapping on overflow. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)((__v16qi)__a + (__v16qi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)((__v8hi)__a + (__v8hi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi32(__m128i
__a, __m128i __b)
{
  return (__m128i)((__v4si)__a + (__v4si)__b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_add_si64(__m64 __a, __m64 __b)
{
  return __a + __b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi64(__m128i __a, __m128i __b)
{
  return __a + __b;
}

/* Saturating adds (PADDS*/PADDUS*): results clamp instead of wrapping. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
}

/* Rounded unsigned averages (PAVGB/PAVGW). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
}

/* Multiply 16-bit pairs and horizontally add adjacent 32-bit products. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_madd_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
}

/* High 16 bits of the 32-bit products (signed / unsigned). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epu16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
}

/* Low 16 bits of the products. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mullo_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)((__v8hi)__a * (__v8hi)__b);
}

/* Unsigned 32x32 -> 64-bit multiply of the low elements. */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mul_su32(__m64 __a, __m64 __b)
{
  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mul_epu32(__m128i __a, __m128i __b)
{
  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
}

/* Sum of absolute byte differences (PSADBW). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sad_epu8(__m128i __a, __m128i __b)
{
  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__,
__nodebug__))
_mm_sub_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)((__v16qi)__a - (__v16qi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)((__v8hi)__a - (__v8hi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)((__v4si)__a - (__v4si)__b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sub_si64(__m64 __a, __m64 __b)
{
  return __a - __b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi64(__m128i __a, __m128i __b)
{
  return __a - __b;
}

/* Saturating subtraction (PSUBS*/PSUBUS*). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
}

/* 128-bit-wide bitwise logic via the generic vector operators. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_and_si128(__m128i __a, __m128i __b)
{
  return __a & __b;
}

/* (~a) & b — __a is the complemented operand, per PANDN. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_andnot_si128(__m128i __a, __m128i __b)
{
  return ~__a & __b;
}

static
__inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_or_si128(__m128i __a, __m128i __b)
{
  return __a | __b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_xor_si128(__m128i __a, __m128i __b)
{
  return __a ^ __b;
}

/* Whole-register byte shift: the macro's count is bytes; the builtin's
   immediate is bits, hence the *8.  Must be a macro so count stays a
   compile-time constant. */
#define _mm_slli_si128(a, count) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_ia32_pslldqi128(__a, (count)*8); })

/* Per-element logical left shifts: _slli_ takes an immediate count,
   _sll_ takes the count in the low quadword of a vector. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi16(__m128i __a, int __count)
{
  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi16(__m128i __a, __m128i __count)
{
  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi32(__m128i __a, int __count)
{
  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi32(__m128i __a, __m128i __count)
{
  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi64(__m128i __a, int __count)
{
  return __builtin_ia32_psllqi128(__a, __count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi64(__m128i __a, __m128i __count)
{
  return __builtin_ia32_psllq128(__a, __count);
}

/* Arithmetic (sign-extending) right shifts — 16- and 32-bit only; SSE2 has
   no 64-bit arithmetic shift. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi16(__m128i __a, int __count)
{
  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi16(__m128i __a,
__m128i __count)
{
  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi32(__m128i __a, int __count)
{
  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi32(__m128i __a, __m128i __count)
{
  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
}


/* Whole-register byte shift right; count is in bytes, builtin takes bits. */
#define _mm_srli_si128(a, count) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_ia32_psrldqi128(__a, (count)*8); })

/* Per-element logical right shifts (immediate / vector-count forms). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi16(__m128i __a, int __count)
{
  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi16(__m128i __a, __m128i __count)
{
  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi32(__m128i __a, int __count)
{
  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi32(__m128i __a, __m128i __count)
{
  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi64(__m128i __a, int __count)
{
  return __builtin_ia32_psrlqi128(__a, __count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi64(__m128i __a, __m128i __count)
{
  return __builtin_ia32_psrlq128(__a, __count);
}

/* Element-wise equality; the vector == operator yields all-ones/all-zeros
   per lane, matching PCMPEQ*. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)((__v16qi)__a == (__v16qi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)((__v8hi)__a == (__v8hi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)((__v4si)__a == (__v4si)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi8(__m128i __a, __m128i __b)
{
  /* This function always performs a signed comparison, but __v16qi is a char
     which may be signed or unsigned. */
  typedef signed char __v16qs __attribute__((__vector_size__(16)));
  return (__m128i)((__v16qs)__a > (__v16qs)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)((__v8hi)__a > (__v8hi)__b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)((__v4si)__a > (__v4si)__b);
}

/* Less-than implemented as greater-than with operands swapped. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi8(__m128i __a, __m128i __b)
{
  return _mm_cmpgt_epi8(__b, __a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi16(__m128i __a, __m128i __b)
{
  return _mm_cmpgt_epi16(__b, __a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi32(__m128i __a, __m128i __b)
{
  return _mm_cmpgt_epi32(__b, __a);
}

/* 64-bit scalar conversions — only available in 64-bit mode. */
#ifdef __x86_64__
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_sd(__m128d __a, long long __b)
{
  __a[0] = __b;
  return __a;
}

static __inline__
long long __attribute__((__always_inline__, __nodebug__)) 999 _mm_cvtsd_si64(__m128d __a) 1000 { 1001 return __builtin_ia32_cvtsd2si64(__a); 1002 } 1003 1004 static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 1005 _mm_cvttsd_si64(__m128d __a) 1006 { 1007 return __a[0]; 1008 } 1009 #endif 1010 1011 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 1012 _mm_cvtepi32_ps(__m128i __a) 1013 { 1014 return __builtin_ia32_cvtdq2ps((__v4si)__a); 1015 } 1016 1017 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1018 _mm_cvtps_epi32(__m128 __a) 1019 { 1020 return (__m128i)__builtin_ia32_cvtps2dq(__a); 1021 } 1022 1023 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1024 _mm_cvttps_epi32(__m128 __a) 1025 { 1026 return (__m128i)__builtin_ia32_cvttps2dq(__a); 1027 } 1028 1029 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1030 _mm_cvtsi32_si128(int __a) 1031 { 1032 return (__m128i)(__v4si){ __a, 0, 0, 0 }; 1033 } 1034 1035 #ifdef __x86_64__ 1036 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1037 _mm_cvtsi64_si128(long long __a) 1038 { 1039 return (__m128i){ __a, 0 }; 1040 } 1041 #endif 1042 1043 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1044 _mm_cvtsi128_si32(__m128i __a) 1045 { 1046 __v4si __b = (__v4si)__a; 1047 return __b[0]; 1048 } 1049 1050 #ifdef __x86_64__ 1051 static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 1052 _mm_cvtsi128_si64(__m128i __a) 1053 { 1054 return __a[0]; 1055 } 1056 #endif 1057 1058 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1059 _mm_load_si128(__m128i const *__p) 1060 { 1061 return *__p; 1062 } 1063 1064 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1065 _mm_loadu_si128(__m128i const *__p) 1066 { 1067 struct __loadu_si128 { 1068 __m128i __v; 1069 } __attribute__((packed, may_alias)); 1070 
return ((struct __loadu_si128*)__p)->__v; 1071 } 1072 1073 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1074 _mm_loadl_epi64(__m128i const *__p) 1075 { 1076 struct __mm_loadl_epi64_struct { 1077 long long __u; 1078 } __attribute__((__packed__, __may_alias__)); 1079 return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0}; 1080 } 1081 1082 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1083 _mm_set_epi64x(long long q1, long long q0) 1084 { 1085 return (__m128i){ q0, q1 }; 1086 } 1087 1088 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1089 _mm_set_epi64(__m64 q1, __m64 q0) 1090 { 1091 return (__m128i){ (long long)q0, (long long)q1 }; 1092 } 1093 1094 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1095 _mm_set_epi32(int i3, int i2, int i1, int i0) 1096 { 1097 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1098 } 1099 1100 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1101 _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0) 1102 { 1103 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1104 } 1105 1106 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1107 _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0) 1108 { 1109 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1110 } 1111 1112 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1113 _mm_set1_epi64x(long long __q) 1114 { 1115 return (__m128i){ __q, __q }; 1116 } 1117 1118 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1119 _mm_set1_epi64(__m64 __q) 1120 { 1121 return (__m128i){ (long long)__q, (long long)__q }; 1122 } 1123 1124 static __inline__ __m128i __attribute__((__always_inline__, 
__nodebug__)) 1125 _mm_set1_epi32(int __i) 1126 { 1127 return (__m128i)(__v4si){ __i, __i, __i, __i }; 1128 } 1129 1130 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1131 _mm_set1_epi16(short __w) 1132 { 1133 return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w }; 1134 } 1135 1136 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1137 _mm_set1_epi8(char __b) 1138 { 1139 return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b }; 1140 } 1141 1142 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1143 _mm_setr_epi64(__m64 q0, __m64 q1) 1144 { 1145 return (__m128i){ (long long)q0, (long long)q1 }; 1146 } 1147 1148 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1149 _mm_setr_epi32(int i0, int i1, int i2, int i3) 1150 { 1151 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1152 } 1153 1154 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1155 _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7) 1156 { 1157 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1158 } 1159 1160 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1161 _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15) 1162 { 1163 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1164 } 1165 1166 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1167 _mm_setzero_si128(void) 1168 { 1169 return (__m128i){ 0LL, 0LL }; 1170 } 1171 1172 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1173 _mm_store_si128(__m128i *__p, __m128i __b) 1174 { 1175 *__p = __b; 1176 } 1177 1178 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1179 
_mm_storeu_si128(__m128i *__p, __m128i __b) 1180 { 1181 __builtin_ia32_storedqu((char *)__p, (__v16qi)__b); 1182 } 1183 1184 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1185 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) 1186 { 1187 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); 1188 } 1189 1190 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1191 _mm_storel_epi64(__m128i *__p, __m128i __a) 1192 { 1193 struct __mm_storel_epi64_struct { 1194 long long __u; 1195 } __attribute__((__packed__, __may_alias__)); 1196 ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; 1197 } 1198 1199 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1200 _mm_stream_pd(double *__p, __m128d __a) 1201 { 1202 __builtin_ia32_movntpd(__p, __a); 1203 } 1204 1205 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1206 _mm_stream_si128(__m128i *__p, __m128i __a) 1207 { 1208 __builtin_ia32_movntdq(__p, __a); 1209 } 1210 1211 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1212 _mm_stream_si32(int *__p, int __a) 1213 { 1214 __builtin_ia32_movnti(__p, __a); 1215 } 1216 1217 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1218 _mm_clflush(void const *__p) 1219 { 1220 __builtin_ia32_clflush(__p); 1221 } 1222 1223 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1224 _mm_lfence(void) 1225 { 1226 __builtin_ia32_lfence(); 1227 } 1228 1229 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1230 _mm_mfence(void) 1231 { 1232 __builtin_ia32_mfence(); 1233 } 1234 1235 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1236 _mm_packs_epi16(__m128i __a, __m128i __b) 1237 { 1238 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); 1239 } 1240 1241 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1242 _mm_packs_epi32(__m128i __a, __m128i __b) 1243 
{ 1244 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); 1245 } 1246 1247 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1248 _mm_packus_epi16(__m128i __a, __m128i __b) 1249 { 1250 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); 1251 } 1252 1253 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1254 _mm_extract_epi16(__m128i __a, int __imm) 1255 { 1256 __v8hi __b = (__v8hi)__a; 1257 return (unsigned short)__b[__imm]; 1258 } 1259 1260 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1261 _mm_insert_epi16(__m128i __a, int __b, int __imm) 1262 { 1263 __v8hi __c = (__v8hi)__a; 1264 __c[__imm & 7] = __b; 1265 return (__m128i)__c; 1266 } 1267 1268 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1269 _mm_movemask_epi8(__m128i __a) 1270 { 1271 return __builtin_ia32_pmovmskb128((__v16qi)__a); 1272 } 1273 1274 #define _mm_shuffle_epi32(a, imm) __extension__ ({ \ 1275 __m128i __a = (a); \ 1276 (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si) _mm_set1_epi32(0), \ 1277 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 1278 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); }) 1279 1280 #define _mm_shufflelo_epi16(a, imm) __extension__ ({ \ 1281 __m128i __a = (a); \ 1282 (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \ 1283 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 1284 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ 1285 4, 5, 6, 7); }) 1286 1287 #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \ 1288 __m128i __a = (a); \ 1289 (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \ 1290 0, 1, 2, 3, \ 1291 4 + (((imm) & 0x03) >> 0), \ 1292 4 + (((imm) & 0x0c) >> 2), \ 1293 4 + (((imm) & 0x30) >> 4), \ 1294 4 + (((imm) & 0xc0) >> 6)); }) 1295 1296 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1297 _mm_unpackhi_epi8(__m128i __a, __m128i __b) 1298 { 1299 return 
(__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 1300 } 1301 1302 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1303 _mm_unpackhi_epi16(__m128i __a, __m128i __b) 1304 { 1305 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 1306 } 1307 1308 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1309 _mm_unpackhi_epi32(__m128i __a, __m128i __b) 1310 { 1311 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); 1312 } 1313 1314 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1315 _mm_unpackhi_epi64(__m128i __a, __m128i __b) 1316 { 1317 return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1); 1318 } 1319 1320 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1321 _mm_unpacklo_epi8(__m128i __a, __m128i __b) 1322 { 1323 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 1324 } 1325 1326 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1327 _mm_unpacklo_epi16(__m128i __a, __m128i __b) 1328 { 1329 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 1330 } 1331 1332 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1333 _mm_unpacklo_epi32(__m128i __a, __m128i __b) 1334 { 1335 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); 1336 } 1337 1338 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1339 _mm_unpacklo_epi64(__m128i __a, __m128i __b) 1340 { 1341 return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0); 1342 } 1343 1344 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 1345 _mm_movepi64_pi64(__m128i __a) 1346 { 1347 return (__m64)__a[0]; 
1348 } 1349 1350 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1351 _mm_movpi64_pi64(__m64 __a) 1352 { 1353 return (__m128i){ (long long)__a, 0 }; 1354 } 1355 1356 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1357 _mm_move_epi64(__m128i __a) 1358 { 1359 return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2); 1360 } 1361 1362 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1363 _mm_unpackhi_pd(__m128d __a, __m128d __b) 1364 { 1365 return __builtin_shufflevector(__a, __b, 1, 2+1); 1366 } 1367 1368 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1369 _mm_unpacklo_pd(__m128d __a, __m128d __b) 1370 { 1371 return __builtin_shufflevector(__a, __b, 0, 2+0); 1372 } 1373 1374 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1375 _mm_movemask_pd(__m128d __a) 1376 { 1377 return __builtin_ia32_movmskpd(__a); 1378 } 1379 1380 #define _mm_shuffle_pd(a, b, i) __extension__ ({ \ 1381 __m128d __a = (a); \ 1382 __m128d __b = (b); \ 1383 __builtin_shufflevector(__a, __b, (i) & 1, (((i) & 2) >> 1) + 2); }) 1384 1385 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 1386 _mm_castpd_ps(__m128d __a) 1387 { 1388 return (__m128)__a; 1389 } 1390 1391 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1392 _mm_castpd_si128(__m128d __a) 1393 { 1394 return (__m128i)__a; 1395 } 1396 1397 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1398 _mm_castps_pd(__m128 __a) 1399 { 1400 return (__m128d)__a; 1401 } 1402 1403 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1404 _mm_castps_si128(__m128 __a) 1405 { 1406 return (__m128i)__a; 1407 } 1408 1409 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 1410 _mm_castsi128_ps(__m128i __a) 1411 { 1412 return (__m128)__a; 1413 } 1414 1415 static __inline__ __m128d __attribute__((__always_inline__, 
__nodebug__)) 1416 _mm_castsi128_pd(__m128i __a) 1417 { 1418 return (__m128d)__a; 1419 } 1420 1421 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1422 _mm_pause(void) 1423 { 1424 __asm__ volatile ("pause"); 1425 } 1426 1427 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 1428 1429 #endif /* __SSE2__ */ 1430 1431 #endif /* __EMMINTRIN_H */ 1432