/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __EMMINTRIN_H
#define __EMMINTRIN_H

#ifndef __SSE2__
#error "SSE2 instruction set not enabled"
#else

#include <xmmintrin.h>

typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Type defines. */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_sd(__m128d a, __m128d b)
{
  a[0] += b[0];
  return a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_pd(__m128d a, __m128d b)
{
  return a + b;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_sd(__m128d a, __m128d b)
{
  a[0] -= b[0];
  return a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_pd(__m128d a, __m128d b)
{
  return a - b;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_sd(__m128d a, __m128d b)
{
  a[0] *= b[0];
  return a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_pd(__m128d a, __m128d b)
{
  return a * b;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_sd(__m128d a, __m128d b)
{
  a[0] /= b[0];
  return a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_pd(__m128d a, __m128d b)
{
  return a / b;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_sd(__m128d a, __m128d b)
{
  __m128d c = __builtin_ia32_sqrtsd(b);
  return (__m128d) { c[0], a[1] };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_pd(__m128d a)
{
  return __builtin_ia32_sqrtpd(a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_minsd(a, b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_minpd(a, b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_maxsd(a, b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_maxpd(a, b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_and_pd(__m128d a, __m128d b)
{
  return (__m128d)((__v4si)a & (__v4si)b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_andnot_pd(__m128d a, __m128d b)
{
  return (__m128d)(~(__v4si)a & (__v4si)b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_or_pd(__m128d a, __m128d b)
{
  return (__m128d)((__v4si)a | (__v4si)b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_xor_pd(__m128d a, __m128d b)
{
  return (__m128d)((__v4si)a ^ (__v4si)b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 2);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(b, a, 1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(b, a, 2);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 7);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 3);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 4);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 5);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 6);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(b, a, 5);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(b, a, 6);
}
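
/* Note on the comparison intrinsics above and below: the third argument to
 * __builtin_ia32_cmppd/__builtin_ia32_cmpsd is the CMPPD/CMPSD predicate
 * immediate (0 = EQ, 1 = LT, 2 = LE, 3 = UNORD, 4 = NEQ, 5 = NLT, 6 = NLE,
 * 7 = ORD).  The "greater" forms (_mm_cmpgt_*, _mm_cmpge_*, _mm_cmpngt_*,
 * _mm_cmpnge_*) swap their operands and reuse the LT/LE/NLT/NLE predicates,
 * since the instruction provides no GT/GE encodings. */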

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(b, a, 1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(b, a, 2);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(b, a, 5);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(b, a, 6);
}
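
/* The _mm_comi*_sd intrinsics below compile to COMISD and the _mm_ucomi*_sd
 * intrinsics to UCOMISD.  Both return 0 or 1; they differ only in that COMISD
 * raises the invalid-operation exception for quiet NaN operands, whereas
 * UCOMISD raises it only for signaling NaNs. */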

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdeq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdlt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdle(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdgt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdge(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdneq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdeq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdlt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdle(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdgt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdge(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdneq(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_ps(__m128d a)
{
  return __builtin_ia32_cvtpd2ps(a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pd(__m128 a)
{
  return __builtin_ia32_cvtps2pd(a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_pd(__m128i a)
{
  return __builtin_ia32_cvtdq2pd((__v4si)a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_epi32(__m128d a)
{
  return __builtin_ia32_cvtpd2dq(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si32(__m128d a)
{
  return __builtin_ia32_cvtsd2si(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_ss(__m128 a, __m128d b)
{
  a[0] = b[0];
  return a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_sd(__m128d a, int b)
{
  a[0] = b;
  return a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_sd(__m128d a, __m128 b)
{
  a[0] = b[0];
  return a;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_epi32(__m128d a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si32(__m128d a)
{
  return a[0];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_pi32(__m128d a)
{
  return (__m64)__builtin_ia32_cvtpd2pi(a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_pi32(__m128d a)
{
  return (__m64)__builtin_ia32_cvttpd2pi(a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_pd(__m64 a)
{
  return __builtin_ia32_cvtpi2pd((__v2si)a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_f64(__m128d a)
{
  return a[0];
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_pd(double const *dp)
{
  return *(__m128d*)dp;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load1_pd(double const *dp)
{
  struct __mm_load1_pd_struct {
    double u;
  } __attribute__((__packed__, __may_alias__));
  double u = ((struct __mm_load1_pd_struct*)dp)->u;
  return (__m128d){ u, u };
}

#define _mm_load_pd1(dp) _mm_load1_pd(dp)
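
/* Like _mm_load1_pd above, several of the unaligned load/store intrinsics in
 * this file go through a local one-member struct declared __packed__ and
 * __may_alias__: packed removes the natural-alignment assumption so the
 * access may be unaligned, and may_alias exempts it from strict-aliasing
 * restrictions. */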

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadr_pd(double const *dp)
{
  __m128d u = *(__m128d*)dp;
  return __builtin_shufflevector(u, u, 1, 0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadu_pd(double const *dp)
{
  struct __loadu_pd {
    __m128d v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_pd*)dp)->v;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_sd(double const *dp)
{
  struct __mm_load_sd_struct {
    double u;
  } __attribute__((__packed__, __may_alias__));
  double u = ((struct __mm_load_sd_struct*)dp)->u;
  return (__m128d){ u, 0 };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pd(__m128d a, double const *dp)
{
  struct __mm_loadh_pd_struct {
    double u;
  } __attribute__((__packed__, __may_alias__));
  double u = ((struct __mm_loadh_pd_struct*)dp)->u;
  return (__m128d){ a[0], u };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pd(__m128d a, double const *dp)
{
  struct __mm_loadl_pd_struct {
    double u;
  } __attribute__((__packed__, __may_alias__));
  double u = ((struct __mm_loadl_pd_struct*)dp)->u;
  return (__m128d){ u, a[1] };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_sd(double w)
{
  return (__m128d){ w, 0 };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set1_pd(double w)
{
  return (__m128d){ w, w };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_pd(double w, double x)
{
  return (__m128d){ x, w };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setr_pd(double w, double x)
{
  return (__m128d){ w, x };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setzero_pd(void)
{
  return (__m128d){ 0, 0 };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_move_sd(__m128d a, __m128d b)
{
  return (__m128d){ b[0], a[1] };
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_sd(double *dp, __m128d a)
{
  struct __mm_store_sd_struct {
    double u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sd_struct*)dp)->u = a[0];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_pd(double *dp, __m128d a)
{
  struct __mm_store1_pd_struct {
    double u[2];
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store1_pd_struct*)dp)->u[0] = a[0];
  ((struct __mm_store1_pd_struct*)dp)->u[1] = a[0];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_pd(double *dp, __m128d a)
{
  *(__m128d *)dp = a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_pd(double *dp, __m128d a)
{
  __builtin_ia32_storeupd(dp, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_pd(double *dp, __m128d a)
{
  a = __builtin_shufflevector(a, a, 1, 0);
  *(__m128d *)dp = a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeh_pd(double *dp, __m128d a)
{
  struct __mm_storeh_pd_struct {
    double u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storeh_pd_struct*)dp)->u = a[1];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_pd(double *dp, __m128d a)
{
  struct __mm_storeh_pd_struct {
    double u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storeh_pd_struct*)dp)->u = a[0];
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi8(__m128i a, __m128i b)
{
  return (__m128i)((__v16qi)a + (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi16(__m128i a, __m128i b)
{
  return (__m128i)((__v8hi)a + (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi32(__m128i a, __m128i b)
{
  return (__m128i)((__v4si)a + (__v4si)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_add_si64(__m64 a, __m64 b)
{
  return a + b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi64(__m128i a, __m128i b)
{
  return a + b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_madd_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mullo_epi16(__m128i a, __m128i b)
{
  return (__m128i)((__v8hi)a * (__v8hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mul_su32(__m64 a, __m64 b)
{
  return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mul_epu32(__m128i a, __m128i b)
{
  return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sad_epu8(__m128i a, __m128i b)
{
  return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi8(__m128i a, __m128i b)
{
  return (__m128i)((__v16qi)a - (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi16(__m128i a, __m128i b)
{
  return (__m128i)((__v8hi)a - (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi32(__m128i a, __m128i b)
{
  return (__m128i)((__v4si)a - (__v4si)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sub_si64(__m64 a, __m64 b)
{
  return a - b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi64(__m128i a, __m128i b)
{
  return a - b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_and_si128(__m128i a, __m128i b)
{
  return a & b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_andnot_si128(__m128i a, __m128i b)
{
  return ~a & b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_or_si128(__m128i a, __m128i b)
{
  return a | b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_xor_si128(__m128i a, __m128i b)
{
  return a ^ b;
}

#define _mm_slli_si128(a, count) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_ia32_pslldqi128(__a, (count)*8); })
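
/* _mm_slli_si128 above (and _mm_srli_si128 below) take the shift amount as a
 * byte count, while the underlying builtins expect bits, hence the (count)*8.
 * Illustrative example:
 *
 *   __m128i v = _mm_set_epi32(3, 2, 1, 0);
 *   __m128i r = _mm_slli_si128(v, 4);   // equals _mm_set_epi32(2, 1, 0, 0)
 */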

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi16(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi16(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi32(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi32(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi64(__m128i a, int count)
{
  return __builtin_ia32_psllqi128(a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi64(__m128i a, __m128i count)
{
  return __builtin_ia32_psllq128(a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi16(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi16(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi32(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi32(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
}

#define _mm_srli_si128(a, count) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_ia32_psrldqi128(__a, (count)*8); })

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi16(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi16(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi32(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi32(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi64(__m128i a, int count)
{
  return __builtin_ia32_psrlqi128(a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi64(__m128i a, __m128i count)
{
  return __builtin_ia32_psrlq128(a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi8(__m128i a, __m128i b)
{
  return (__m128i)((__v16qi)a == (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi16(__m128i a, __m128i b)
{
  return (__m128i)((__v8hi)a == (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi32(__m128i a, __m128i b)
{
  return (__m128i)((__v4si)a == (__v4si)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi8(__m128i a, __m128i b)
{
  /* This function always performs a signed comparison, but __v16qi is a char
     which may be signed or unsigned. */
  typedef signed char __v16qs __attribute__((__vector_size__(16)));
  return (__m128i)((__v16qs)a > (__v16qs)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi16(__m128i a, __m128i b)
{
  return (__m128i)((__v8hi)a > (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi32(__m128i a, __m128i b)
{
  return (__m128i)((__v4si)a > (__v4si)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi8(__m128i a, __m128i b)
{
  return _mm_cmpgt_epi8(b,a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi16(__m128i a, __m128i b)
{
  return _mm_cmpgt_epi16(b,a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi32(__m128i a, __m128i b)
{
  return _mm_cmpgt_epi32(b,a);
}

#ifdef __x86_64__
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_sd(__m128d a, long long b)
{
  a[0] = b;
  return a;
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si64(__m128d a)
{
  return __builtin_ia32_cvtsd2si64(a);
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si64(__m128d a)
{
  return a[0];
}
#endif

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_ps(__m128i a)
{
  return __builtin_ia32_cvtdq2ps((__v4si)a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_epi32(__m128 a)
{
  return (__m128i)__builtin_ia32_cvtps2dq(a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_epi32(__m128 a)
{
  return (__m128i)__builtin_ia32_cvttps2dq(a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_si128(int a)
{
  return (__m128i)(__v4si){ a, 0, 0, 0 };
}

#ifdef __x86_64__
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_si128(long long a)
{
  return (__m128i){ a, 0 };
}
#endif

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si32(__m128i a)
{
  __v4si b = (__v4si)a;
  return b[0];
}

#ifdef __x86_64__
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si64(__m128i a)
{
  return a[0];
}
#endif

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_load_si128(__m128i const *p)
{
  return *p;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadu_si128(__m128i const *p)
{
  struct __loadu_si128 {
    __m128i v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_si128*)p)->v;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadl_epi64(__m128i const *p)
{
  struct __mm_loadl_epi64_struct {
    long long u;
  } __attribute__((__packed__, __may_alias__));
  return (__m128i) { ((struct __mm_loadl_epi64_struct*)p)->u, 0};
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi64x(long long q1, long long q0)
{
  return (__m128i){ q0, q1 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi64(__m64 q1, __m64 q0)
{
  return (__m128i){ (long long)q0, (long long)q1 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi32(int i3, int i2, int i1, int i0)
{
  return (__m128i)(__v4si){ i0, i1, i2, i3};
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
{
  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
{
  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi64x(long long q)
{
  return (__m128i){ q, q };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi64(__m64 q)
{
  return (__m128i){ (long long)q, (long long)q };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi32(int i)
{
  return (__m128i)(__v4si){ i, i, i, i };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi16(short w)
{
  return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi8(char b)
{
  return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi64(__m64 q0, __m64 q1)
{
  return (__m128i){ (long long)q0, (long long)q1 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi32(int i0, int i1, int i2, int i3)
{
  return (__m128i)(__v4si){ i0, i1, i2, i3};
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
{
  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
{
  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setzero_si128(void)
{
  return (__m128i){ 0LL, 0LL };
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_si128(__m128i *p, __m128i b)
{
  *p = b;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_si128(__m128i *p, __m128i b)
{
  __builtin_ia32_storedqu((char *)p, (__v16qi)b);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
{
  __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_epi64(__m128i *p, __m128i a)
{
  __builtin_ia32_storelv4si((__v2si *)p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pd(double *p, __m128d a)
{
  __builtin_ia32_movntpd(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si128(__m128i *p, __m128i a)
{
  __builtin_ia32_movntdq(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a)
{
  __builtin_ia32_movnti(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p)
{
  __builtin_ia32_clflush(p);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __builtin_ia32_lfence();
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __builtin_ia32_mfence();
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi32(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packus_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_extract_epi16(__m128i a, int imm)
{
  __v8hi b = (__v8hi)a;
  return (unsigned short)b[imm];
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_insert_epi16(__m128i a, int b, int imm)
{
  __v8hi c = (__v8hi)a;
  c[imm & 7] = b;
  return (__m128i)c;
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_epi8(__m128i a)
{
  return __builtin_ia32_pmovmskb128((__v16qi)a);
}

#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si) _mm_set1_epi32(0), \
                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
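
/* In _mm_shuffle_epi32 above and the _mm_shufflelo/_mm_shufflehi macros below,
 * each two-bit field of imm picks the source element for one result position;
 * the zero vector passed as the second operand is only a placeholder, since
 * __builtin_shufflevector always takes two input vectors.  Illustrative
 * example (using _MM_SHUFFLE from xmmintrin.h):
 *
 *   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));  // reversed v
 */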

#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
                                   4, 5, 6, 7); })

#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) & 0x03) >> 0), \
                                   4 + (((imm) & 0x0c) >> 2), \
                                   4 + (((imm) & 0x30) >> 4), \
                                   4 + (((imm) & 0xc0) >> 6)); })

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi32(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi64(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi32(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi64(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_movepi64_pi64(__m128i a)
{
  return (__m64)a[0];
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_movpi64_pi64(__m64 a)
{
  return (__m128i){ (long long)a, 0 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_move_epi64(__m128i a)
{
  return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_pd(__m128d a, __m128d b)
{
  return __builtin_shufflevector(a, b, 1, 2+1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_pd(__m128d a, __m128d b)
{
  return __builtin_shufflevector(a, b, 0, 2+0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pd(__m128d a)
{
  return __builtin_ia32_movmskpd(a);
}

#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  __builtin_shufflevector(__a, __b, (i) & 1, (((i) & 2) >> 1) + 2); })

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castpd_ps(__m128d in)
{
  return (__m128)in;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castpd_si128(__m128d in)
{
  return (__m128i)in;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castps_pd(__m128 in)
{
  return (__m128d)in;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castps_si128(__m128 in)
{
  return (__m128i)in;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_ps(__m128i in)
{
  return (__m128)in;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_pd(__m128i in)
{
  return (__m128d)in;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ volatile ("pause");
}

#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

#endif /* __SSE2__ */

#endif /* __EMMINTRIN_H */