/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

/* Include guard: the body of this header is processed at most once per
 * translation unit. */
#ifndef __EMMINTRIN_H
#define __EMMINTRIN_H

#include <xmmintrin.h>

/* 16-byte vector types: __m128d has double elements, __m128i has long long
 * elements. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Type defines.
*/ 33 typedef double __v2df __attribute__ ((__vector_size__ (16))); 34 typedef long long __v2di __attribute__ ((__vector_size__ (16))); 35 typedef short __v8hi __attribute__((__vector_size__(16))); 36 typedef char __v16qi __attribute__((__vector_size__(16))); 37 38 /* Unsigned types */ 39 typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16))); 40 typedef unsigned short __v8hu __attribute__((__vector_size__(16))); 41 typedef unsigned char __v16qu __attribute__((__vector_size__(16))); 42 43 /* We need an explicitly signed variant for char. Note that this shouldn't 44 * appear in the interface though. */ 45 typedef signed char __v16qs __attribute__((__vector_size__(16))); 46 47 #include <f16cintrin.h> 48 49 /* Define the default attributes for the functions in this file. */ 50 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 51 52 /// \brief Adds lower double-precision values in both operands and returns the 53 /// sum in the lower 64 bits of the result. The upper 64 bits of the result 54 /// are copied from the upper double-precision value of the first operand. 55 /// 56 /// \headerfile <x86intrin.h> 57 /// 58 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction. 59 /// 60 /// \param __a 61 /// A 128-bit vector of [2 x double] containing one of the source operands. 62 /// \param __b 63 /// A 128-bit vector of [2 x double] containing one of the source operands. 64 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 65 /// sum of the lower 64 bits of both operands. The upper 64 bits are copied 66 /// from the upper 64 bits of the first source operand. 67 static __inline__ __m128d __DEFAULT_FN_ATTRS 68 _mm_add_sd(__m128d __a, __m128d __b) 69 { 70 __a[0] += __b[0]; 71 return __a; 72 } 73 74 /// \brief Adds two 128-bit vectors of [2 x double]. 
75 /// 76 /// \headerfile <x86intrin.h> 77 /// 78 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction. 79 /// 80 /// \param __a 81 /// A 128-bit vector of [2 x double] containing one of the source operands. 82 /// \param __b 83 /// A 128-bit vector of [2 x double] containing one of the source operands. 84 /// \returns A 128-bit vector of [2 x double] containing the sums of both 85 /// operands. 86 static __inline__ __m128d __DEFAULT_FN_ATTRS 87 _mm_add_pd(__m128d __a, __m128d __b) 88 { 89 return (__m128d)((__v2df)__a + (__v2df)__b); 90 } 91 92 /// \brief Subtracts the lower double-precision value of the second operand 93 /// from the lower double-precision value of the first operand and returns 94 /// the difference in the lower 64 bits of the result. The upper 64 bits of 95 /// the result are copied from the upper double-precision value of the first 96 /// operand. 97 /// 98 /// \headerfile <x86intrin.h> 99 /// 100 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction. 101 /// 102 /// \param __a 103 /// A 128-bit vector of [2 x double] containing the minuend. 104 /// \param __b 105 /// A 128-bit vector of [2 x double] containing the subtrahend. 106 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 107 /// difference of the lower 64 bits of both operands. The upper 64 bits are 108 /// copied from the upper 64 bits of the first source operand. 109 static __inline__ __m128d __DEFAULT_FN_ATTRS 110 _mm_sub_sd(__m128d __a, __m128d __b) 111 { 112 __a[0] -= __b[0]; 113 return __a; 114 } 115 116 /// \brief Subtracts two 128-bit vectors of [2 x double]. 117 /// 118 /// \headerfile <x86intrin.h> 119 /// 120 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction. 121 /// 122 /// \param __a 123 /// A 128-bit vector of [2 x double] containing the minuend. 124 /// \param __b 125 /// A 128-bit vector of [2 x double] containing the subtrahend. 
126 /// \returns A 128-bit vector of [2 x double] containing the differences between 127 /// both operands. 128 static __inline__ __m128d __DEFAULT_FN_ATTRS 129 _mm_sub_pd(__m128d __a, __m128d __b) 130 { 131 return (__m128d)((__v2df)__a - (__v2df)__b); 132 } 133 134 /// \brief Multiplies lower double-precision values in both operands and returns 135 /// the product in the lower 64 bits of the result. The upper 64 bits of the 136 /// result are copied from the upper double-precision value of the first 137 /// operand. 138 /// 139 /// \headerfile <x86intrin.h> 140 /// 141 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction. 142 /// 143 /// \param __a 144 /// A 128-bit vector of [2 x double] containing one of the source operands. 145 /// \param __b 146 /// A 128-bit vector of [2 x double] containing one of the source operands. 147 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 148 /// product of the lower 64 bits of both operands. The upper 64 bits are 149 /// copied from the upper 64 bits of the first source operand. 150 static __inline__ __m128d __DEFAULT_FN_ATTRS 151 _mm_mul_sd(__m128d __a, __m128d __b) 152 { 153 __a[0] *= __b[0]; 154 return __a; 155 } 156 157 /// \brief Multiplies two 128-bit vectors of [2 x double]. 158 /// 159 /// \headerfile <x86intrin.h> 160 /// 161 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction. 162 /// 163 /// \param __a 164 /// A 128-bit vector of [2 x double] containing one of the operands. 165 /// \param __b 166 /// A 128-bit vector of [2 x double] containing one of the operands. 167 /// \returns A 128-bit vector of [2 x double] containing the products of both 168 /// operands. 
169 static __inline__ __m128d __DEFAULT_FN_ATTRS 170 _mm_mul_pd(__m128d __a, __m128d __b) 171 { 172 return (__m128d)((__v2df)__a * (__v2df)__b); 173 } 174 175 /// \brief Divides the lower double-precision value of the first operand by the 176 /// lower double-precision value of the second operand and returns the 177 /// quotient in the lower 64 bits of the result. The upper 64 bits of the 178 /// result are copied from the upper double-precision value of the first 179 /// operand. 180 /// 181 /// \headerfile <x86intrin.h> 182 /// 183 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction. 184 /// 185 /// \param __a 186 /// A 128-bit vector of [2 x double] containing the dividend. 187 /// \param __b 188 /// A 128-bit vector of [2 x double] containing divisor. 189 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 190 /// quotient of the lower 64 bits of both operands. The upper 64 bits are 191 /// copied from the upper 64 bits of the first source operand. 192 static __inline__ __m128d __DEFAULT_FN_ATTRS 193 _mm_div_sd(__m128d __a, __m128d __b) 194 { 195 __a[0] /= __b[0]; 196 return __a; 197 } 198 199 /// \brief Performs an element-by-element division of two 128-bit vectors of 200 /// [2 x double]. 201 /// 202 /// \headerfile <x86intrin.h> 203 /// 204 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction. 205 /// 206 /// \param __a 207 /// A 128-bit vector of [2 x double] containing the dividend. 208 /// \param __b 209 /// A 128-bit vector of [2 x double] containing the divisor. 210 /// \returns A 128-bit vector of [2 x double] containing the quotients of both 211 /// operands. 212 static __inline__ __m128d __DEFAULT_FN_ATTRS 213 _mm_div_pd(__m128d __a, __m128d __b) 214 { 215 return (__m128d)((__v2df)__a / (__v2df)__b); 216 } 217 218 /// \brief Calculates the square root of the lower double-precision value of 219 /// the second operand and returns it in the lower 64 bits of the result. 
220 /// The upper 64 bits of the result are copied from the upper double- 221 /// precision value of the first operand. 222 /// 223 /// \headerfile <x86intrin.h> 224 /// 225 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction. 226 /// 227 /// \param __a 228 /// A 128-bit vector of [2 x double] containing one of the operands. The 229 /// upper 64 bits of this operand are copied to the upper 64 bits of the 230 /// result. 231 /// \param __b 232 /// A 128-bit vector of [2 x double] containing one of the operands. The 233 /// square root is calculated using the lower 64 bits of this operand. 234 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 235 /// square root of the lower 64 bits of operand \a __b, and whose upper 64 236 /// bits are copied from the upper 64 bits of operand \a __a. 237 static __inline__ __m128d __DEFAULT_FN_ATTRS 238 _mm_sqrt_sd(__m128d __a, __m128d __b) 239 { 240 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b); 241 return (__m128d) { __c[0], __a[1] }; 242 } 243 244 /// \brief Calculates the square root of the each of two values stored in a 245 /// 128-bit vector of [2 x double]. 246 /// 247 /// \headerfile <x86intrin.h> 248 /// 249 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction. 250 /// 251 /// \param __a 252 /// A 128-bit vector of [2 x double]. 253 /// \returns A 128-bit vector of [2 x double] containing the square roots of the 254 /// values in the operand. 255 static __inline__ __m128d __DEFAULT_FN_ATTRS 256 _mm_sqrt_pd(__m128d __a) 257 { 258 return __builtin_ia32_sqrtpd((__v2df)__a); 259 } 260 261 /// \brief Compares lower 64-bit double-precision values of both operands, and 262 /// returns the lesser of the pair of values in the lower 64-bits of the 263 /// result. The upper 64 bits of the result are copied from the upper double- 264 /// precision value of the first operand. 
265 /// 266 /// \headerfile <x86intrin.h> 267 /// 268 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction. 269 /// 270 /// \param __a 271 /// A 128-bit vector of [2 x double] containing one of the operands. The 272 /// lower 64 bits of this operand are used in the comparison. 273 /// \param __b 274 /// A 128-bit vector of [2 x double] containing one of the operands. The 275 /// lower 64 bits of this operand are used in the comparison. 276 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 277 /// minimum value between both operands. The upper 64 bits are copied from 278 /// the upper 64 bits of the first source operand. 279 static __inline__ __m128d __DEFAULT_FN_ATTRS 280 _mm_min_sd(__m128d __a, __m128d __b) 281 { 282 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b); 283 } 284 285 /// \brief Performs element-by-element comparison of the two 128-bit vectors of 286 /// [2 x double] and returns the vector containing the lesser of each pair of 287 /// values. 288 /// 289 /// \headerfile <x86intrin.h> 290 /// 291 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction. 292 /// 293 /// \param __a 294 /// A 128-bit vector of [2 x double] containing one of the operands. 295 /// \param __b 296 /// A 128-bit vector of [2 x double] containing one of the operands. 297 /// \returns A 128-bit vector of [2 x double] containing the minimum values 298 /// between both operands. 299 static __inline__ __m128d __DEFAULT_FN_ATTRS 300 _mm_min_pd(__m128d __a, __m128d __b) 301 { 302 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b); 303 } 304 305 /// \brief Compares lower 64-bit double-precision values of both operands, and 306 /// returns the greater of the pair of values in the lower 64-bits of the 307 /// result. The upper 64 bits of the result are copied from the upper double- 308 /// precision value of the first operand. 
309 /// 310 /// \headerfile <x86intrin.h> 311 /// 312 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction. 313 /// 314 /// \param __a 315 /// A 128-bit vector of [2 x double] containing one of the operands. The 316 /// lower 64 bits of this operand are used in the comparison. 317 /// \param __b 318 /// A 128-bit vector of [2 x double] containing one of the operands. The 319 /// lower 64 bits of this operand are used in the comparison. 320 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 321 /// maximum value between both operands. The upper 64 bits are copied from 322 /// the upper 64 bits of the first source operand. 323 static __inline__ __m128d __DEFAULT_FN_ATTRS 324 _mm_max_sd(__m128d __a, __m128d __b) 325 { 326 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b); 327 } 328 329 /// \brief Performs element-by-element comparison of the two 128-bit vectors of 330 /// [2 x double] and returns the vector containing the greater of each pair 331 /// of values. 332 /// 333 /// \headerfile <x86intrin.h> 334 /// 335 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction. 336 /// 337 /// \param __a 338 /// A 128-bit vector of [2 x double] containing one of the operands. 339 /// \param __b 340 /// A 128-bit vector of [2 x double] containing one of the operands. 341 /// \returns A 128-bit vector of [2 x double] containing the maximum values 342 /// between both operands. 343 static __inline__ __m128d __DEFAULT_FN_ATTRS 344 _mm_max_pd(__m128d __a, __m128d __b) 345 { 346 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b); 347 } 348 349 /// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double]. 350 /// 351 /// \headerfile <x86intrin.h> 352 /// 353 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. 354 /// 355 /// \param __a 356 /// A 128-bit vector of [2 x double] containing one of the source operands. 
357 /// \param __b 358 /// A 128-bit vector of [2 x double] containing one of the source operands. 359 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 360 /// values between both operands. 361 static __inline__ __m128d __DEFAULT_FN_ATTRS 362 _mm_and_pd(__m128d __a, __m128d __b) 363 { 364 return (__m128d)((__v2du)__a & (__v2du)__b); 365 } 366 367 /// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double], using 368 /// the one's complement of the values contained in the first source operand. 369 /// 370 /// \headerfile <x86intrin.h> 371 /// 372 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. 373 /// 374 /// \param __a 375 /// A 128-bit vector of [2 x double] containing the left source operand. The 376 /// one's complement of this value is used in the bitwise AND. 377 /// \param __b 378 /// A 128-bit vector of [2 x double] containing the right source operand. 379 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 380 /// values in the second operand and the one's complement of the first 381 /// operand. 382 static __inline__ __m128d __DEFAULT_FN_ATTRS 383 _mm_andnot_pd(__m128d __a, __m128d __b) 384 { 385 return (__m128d)(~(__v2du)__a & (__v2du)__b); 386 } 387 388 /// \brief Performs a bitwise OR of two 128-bit vectors of [2 x double]. 389 /// 390 /// \headerfile <x86intrin.h> 391 /// 392 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. 393 /// 394 /// \param __a 395 /// A 128-bit vector of [2 x double] containing one of the source operands. 396 /// \param __b 397 /// A 128-bit vector of [2 x double] containing one of the source operands. 398 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the 399 /// values between both operands. 
400 static __inline__ __m128d __DEFAULT_FN_ATTRS 401 _mm_or_pd(__m128d __a, __m128d __b) 402 { 403 return (__m128d)((__v2du)__a | (__v2du)__b); 404 } 405 406 /// \brief Performs a bitwise XOR of two 128-bit vectors of [2 x double]. 407 /// 408 /// \headerfile <x86intrin.h> 409 /// 410 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. 411 /// 412 /// \param __a 413 /// A 128-bit vector of [2 x double] containing one of the source operands. 414 /// \param __b 415 /// A 128-bit vector of [2 x double] containing one of the source operands. 416 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the 417 /// values between both operands. 418 static __inline__ __m128d __DEFAULT_FN_ATTRS 419 _mm_xor_pd(__m128d __a, __m128d __b) 420 { 421 return (__m128d)((__v2du)__a ^ (__v2du)__b); 422 } 423 424 /// \brief Compares each of the corresponding double-precision values of the 425 /// 128-bit vectors of [2 x double] for equality. Each comparison yields 0h 426 /// for false, FFFFFFFFFFFFFFFFh for true. 427 /// 428 /// \headerfile <x86intrin.h> 429 /// 430 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction. 431 /// 432 /// \param __a 433 /// A 128-bit vector of [2 x double]. 434 /// \param __b 435 /// A 128-bit vector of [2 x double]. 436 /// \returns A 128-bit vector containing the comparison results. 437 static __inline__ __m128d __DEFAULT_FN_ATTRS 438 _mm_cmpeq_pd(__m128d __a, __m128d __b) 439 { 440 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b); 441 } 442 443 /// \brief Compares each of the corresponding double-precision values of the 444 /// 128-bit vectors of [2 x double] to determine if the values in the first 445 /// operand are less than those in the second operand. Each comparison 446 /// yields 0h for false, FFFFFFFFFFFFFFFFh for true. 447 /// 448 /// \headerfile <x86intrin.h> 449 /// 450 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. 
451 /// 452 /// \param __a 453 /// A 128-bit vector of [2 x double]. 454 /// \param __b 455 /// A 128-bit vector of [2 x double]. 456 /// \returns A 128-bit vector containing the comparison results. 457 static __inline__ __m128d __DEFAULT_FN_ATTRS 458 _mm_cmplt_pd(__m128d __a, __m128d __b) 459 { 460 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b); 461 } 462 463 /// \brief Compares each of the corresponding double-precision values of the 464 /// 128-bit vectors of [2 x double] to determine if the values in the first 465 /// operand are less than or equal to those in the second operand. 466 /// 467 /// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 468 /// 469 /// \headerfile <x86intrin.h> 470 /// 471 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. 472 /// 473 /// \param __a 474 /// A 128-bit vector of [2 x double]. 475 /// \param __b 476 /// A 128-bit vector of [2 x double]. 477 /// \returns A 128-bit vector containing the comparison results. 478 static __inline__ __m128d __DEFAULT_FN_ATTRS 479 _mm_cmple_pd(__m128d __a, __m128d __b) 480 { 481 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b); 482 } 483 484 /// \brief Compares each of the corresponding double-precision values of the 485 /// 128-bit vectors of [2 x double] to determine if the values in the first 486 /// operand are greater than those in the second operand. 487 /// 488 /// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 489 /// 490 /// \headerfile <x86intrin.h> 491 /// 492 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. 493 /// 494 /// \param __a 495 /// A 128-bit vector of [2 x double]. 496 /// \param __b 497 /// A 128-bit vector of [2 x double]. 498 /// \returns A 128-bit vector containing the comparison results. 
499 static __inline__ __m128d __DEFAULT_FN_ATTRS 500 _mm_cmpgt_pd(__m128d __a, __m128d __b) 501 { 502 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a); 503 } 504 505 /// \brief Compares each of the corresponding double-precision values of the 506 /// 128-bit vectors of [2 x double] to determine if the values in the first 507 /// operand are greater than or equal to those in the second operand. 508 /// 509 /// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 510 /// 511 /// \headerfile <x86intrin.h> 512 /// 513 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. 514 /// 515 /// \param __a 516 /// A 128-bit vector of [2 x double]. 517 /// \param __b 518 /// A 128-bit vector of [2 x double]. 519 /// \returns A 128-bit vector containing the comparison results. 520 static __inline__ __m128d __DEFAULT_FN_ATTRS 521 _mm_cmpge_pd(__m128d __a, __m128d __b) 522 { 523 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a); 524 } 525 526 /// \brief Compares each of the corresponding double-precision values of the 527 /// 128-bit vectors of [2 x double] to determine if the values in the first 528 /// operand are ordered with respect to those in the second operand. 529 /// 530 /// A pair of double-precision values are "ordered" with respect to each 531 /// other if neither value is a NaN. Each comparison yields 0h for false, 532 /// FFFFFFFFFFFFFFFFh for true. 533 /// 534 /// \headerfile <x86intrin.h> 535 /// 536 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction. 537 /// 538 /// \param __a 539 /// A 128-bit vector of [2 x double]. 540 /// \param __b 541 /// A 128-bit vector of [2 x double]. 542 /// \returns A 128-bit vector containing the comparison results. 
543 static __inline__ __m128d __DEFAULT_FN_ATTRS 544 _mm_cmpord_pd(__m128d __a, __m128d __b) 545 { 546 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b); 547 } 548 549 /// \brief Compares each of the corresponding double-precision values of the 550 /// 128-bit vectors of [2 x double] to determine if the values in the first 551 /// operand are unordered with respect to those in the second operand. 552 /// 553 /// A pair of double-precision values are "unordered" with respect to each 554 /// other if one or both values are NaN. Each comparison yields 0h for false, 555 /// FFFFFFFFFFFFFFFFh for true. 556 /// 557 /// \headerfile <x86intrin.h> 558 /// 559 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c> 560 /// instruction. 561 /// 562 /// \param __a 563 /// A 128-bit vector of [2 x double]. 564 /// \param __b 565 /// A 128-bit vector of [2 x double]. 566 /// \returns A 128-bit vector containing the comparison results. 567 static __inline__ __m128d __DEFAULT_FN_ATTRS 568 _mm_cmpunord_pd(__m128d __a, __m128d __b) 569 { 570 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b); 571 } 572 573 /// \brief Compares each of the corresponding double-precision values of the 574 /// 128-bit vectors of [2 x double] to determine if the values in the first 575 /// operand are unequal to those in the second operand. 576 /// 577 /// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 578 /// 579 /// \headerfile <x86intrin.h> 580 /// 581 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction. 582 /// 583 /// \param __a 584 /// A 128-bit vector of [2 x double]. 585 /// \param __b 586 /// A 128-bit vector of [2 x double]. 587 /// \returns A 128-bit vector containing the comparison results. 
588 static __inline__ __m128d __DEFAULT_FN_ATTRS 589 _mm_cmpneq_pd(__m128d __a, __m128d __b) 590 { 591 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b); 592 } 593 594 /// \brief Compares each of the corresponding double-precision values of the 595 /// 128-bit vectors of [2 x double] to determine if the values in the first 596 /// operand are not less than those in the second operand. 597 /// 598 /// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 599 /// 600 /// \headerfile <x86intrin.h> 601 /// 602 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. 603 /// 604 /// \param __a 605 /// A 128-bit vector of [2 x double]. 606 /// \param __b 607 /// A 128-bit vector of [2 x double]. 608 /// \returns A 128-bit vector containing the comparison results. 609 static __inline__ __m128d __DEFAULT_FN_ATTRS 610 _mm_cmpnlt_pd(__m128d __a, __m128d __b) 611 { 612 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b); 613 } 614 615 /// \brief Compares each of the corresponding double-precision values of the 616 /// 128-bit vectors of [2 x double] to determine if the values in the first 617 /// operand are not less than or equal to those in the second operand. 618 /// 619 /// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 620 /// 621 /// \headerfile <x86intrin.h> 622 /// 623 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. 624 /// 625 /// \param __a 626 /// A 128-bit vector of [2 x double]. 627 /// \param __b 628 /// A 128-bit vector of [2 x double]. 629 /// \returns A 128-bit vector containing the comparison results. 
630 static __inline__ __m128d __DEFAULT_FN_ATTRS 631 _mm_cmpnle_pd(__m128d __a, __m128d __b) 632 { 633 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b); 634 } 635 636 /// \brief Compares each of the corresponding double-precision values of the 637 /// 128-bit vectors of [2 x double] to determine if the values in the first 638 /// operand are not greater than those in the second operand. 639 /// 640 /// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 641 /// 642 /// \headerfile <x86intrin.h> 643 /// 644 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. 645 /// 646 /// \param __a 647 /// A 128-bit vector of [2 x double]. 648 /// \param __b 649 /// A 128-bit vector of [2 x double]. 650 /// \returns A 128-bit vector containing the comparison results. 651 static __inline__ __m128d __DEFAULT_FN_ATTRS 652 _mm_cmpngt_pd(__m128d __a, __m128d __b) 653 { 654 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a); 655 } 656 657 /// \brief Compares each of the corresponding double-precision values of the 658 /// 128-bit vectors of [2 x double] to determine if the values in the first 659 /// operand are not greater than or equal to those in the second operand. 660 /// 661 /// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 662 /// 663 /// \headerfile <x86intrin.h> 664 /// 665 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. 666 /// 667 /// \param __a 668 /// A 128-bit vector of [2 x double]. 669 /// \param __b 670 /// A 128-bit vector of [2 x double]. 671 /// \returns A 128-bit vector containing the comparison results. 672 static __inline__ __m128d __DEFAULT_FN_ATTRS 673 _mm_cmpnge_pd(__m128d __a, __m128d __b) 674 { 675 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a); 676 } 677 678 /// \brief Compares the lower double-precision floating-point values in each of 679 /// the two 128-bit floating-point vectors of [2 x double] for equality. 
680 /// 681 /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 682 /// 683 /// \headerfile <x86intrin.h> 684 /// 685 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction. 686 /// 687 /// \param __a 688 /// A 128-bit vector of [2 x double]. The lower double-precision value is 689 /// compared to the lower double-precision value of \a __b. 690 /// \param __b 691 /// A 128-bit vector of [2 x double]. The lower double-precision value is 692 /// compared to the lower double-precision value of \a __a. 693 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 694 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 695 static __inline__ __m128d __DEFAULT_FN_ATTRS 696 _mm_cmpeq_sd(__m128d __a, __m128d __b) 697 { 698 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b); 699 } 700 701 /// \brief Compares the lower double-precision floating-point values in each of 702 /// the two 128-bit floating-point vectors of [2 x double] to determine if 703 /// the value in the first parameter is less than the corresponding value in 704 /// the second parameter. 705 /// 706 /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 707 /// 708 /// \headerfile <x86intrin.h> 709 /// 710 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. 711 /// 712 /// \param __a 713 /// A 128-bit vector of [2 x double]. The lower double-precision value is 714 /// compared to the lower double-precision value of \a __b. 715 /// \param __b 716 /// A 128-bit vector of [2 x double]. The lower double-precision value is 717 /// compared to the lower double-precision value of \a __a. 718 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 719 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
720 static __inline__ __m128d __DEFAULT_FN_ATTRS 721 _mm_cmplt_sd(__m128d __a, __m128d __b) 722 { 723 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b); 724 } 725 726 /// \brief Compares the lower double-precision floating-point values in each of 727 /// the two 128-bit floating-point vectors of [2 x double] to determine if 728 /// the value in the first parameter is less than or equal to the 729 /// corresponding value in the second parameter. 730 /// 731 /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 732 /// 733 /// \headerfile <x86intrin.h> 734 /// 735 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. 736 /// 737 /// \param __a 738 /// A 128-bit vector of [2 x double]. The lower double-precision value is 739 /// compared to the lower double-precision value of \a __b. 740 /// \param __b 741 /// A 128-bit vector of [2 x double]. The lower double-precision value is 742 /// compared to the lower double-precision value of \a __a. 743 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 744 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 745 static __inline__ __m128d __DEFAULT_FN_ATTRS 746 _mm_cmple_sd(__m128d __a, __m128d __b) 747 { 748 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b); 749 } 750 751 /// \brief Compares the lower double-precision floating-point values in each of 752 /// the two 128-bit floating-point vectors of [2 x double] to determine if 753 /// the value in the first parameter is greater than the corresponding value 754 /// in the second parameter. 755 /// 756 /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 757 /// 758 /// \headerfile <x86intrin.h> 759 /// 760 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. 761 /// 762 /// \param __a 763 /// A 128-bit vector of [2 x double]. 
The lower double-precision value is 764 /// compared to the lower double-precision value of \a __b. 765 /// \param __b 766 /// A 128-bit vector of [2 x double]. The lower double-precision value is 767 /// compared to the lower double-precision value of \a __a. 768 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 769 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 770 static __inline__ __m128d __DEFAULT_FN_ATTRS 771 _mm_cmpgt_sd(__m128d __a, __m128d __b) 772 { 773 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a); 774 return (__m128d) { __c[0], __a[1] }; 775 } 776 777 /// \brief Compares the lower double-precision floating-point values in each of 778 /// the two 128-bit floating-point vectors of [2 x double] to determine if 779 /// the value in the first parameter is greater than or equal to the 780 /// corresponding value in the second parameter. 781 /// 782 /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 783 /// 784 /// \headerfile <x86intrin.h> 785 /// 786 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. 787 /// 788 /// \param __a 789 /// A 128-bit vector of [2 x double]. The lower double-precision value is 790 /// compared to the lower double-precision value of \a __b. 791 /// \param __b 792 /// A 128-bit vector of [2 x double]. The lower double-precision value is 793 /// compared to the lower double-precision value of \a __a. 794 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 795 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
796 static __inline__ __m128d __DEFAULT_FN_ATTRS 797 _mm_cmpge_sd(__m128d __a, __m128d __b) 798 { 799 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a); 800 return (__m128d) { __c[0], __a[1] }; 801 } 802 803 /// \brief Compares the lower double-precision floating-point values in each of 804 /// the two 128-bit floating-point vectors of [2 x double] to determine if 805 /// the value in the first parameter is "ordered" with respect to the 806 /// corresponding value in the second parameter. 807 /// 808 /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. A pair of 809 /// double-precision values are "ordered" with respect to each other if 810 /// neither value is a NaN. 811 /// 812 /// \headerfile <x86intrin.h> 813 /// 814 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction. 815 /// 816 /// \param __a 817 /// A 128-bit vector of [2 x double]. The lower double-precision value is 818 /// compared to the lower double-precision value of \a __b. 819 /// \param __b 820 /// A 128-bit vector of [2 x double]. The lower double-precision value is 821 /// compared to the lower double-precision value of \a __a. 822 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 823 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 824 static __inline__ __m128d __DEFAULT_FN_ATTRS 825 _mm_cmpord_sd(__m128d __a, __m128d __b) 826 { 827 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b); 828 } 829 830 /// \brief Compares the lower double-precision floating-point values in each of 831 /// the two 128-bit floating-point vectors of [2 x double] to determine if 832 /// the value in the first parameter is "unordered" with respect to the 833 /// corresponding value in the second parameter. 834 /// 835 /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 
A pair of 836 /// double-precision values are "unordered" with respect to each other if one 837 /// or both values are NaN. 838 /// 839 /// \headerfile <x86intrin.h> 840 /// 841 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c> 842 /// instruction. 843 /// 844 /// \param __a 845 /// A 128-bit vector of [2 x double]. The lower double-precision value is 846 /// compared to the lower double-precision value of \a __b. 847 /// \param __b 848 /// A 128-bit vector of [2 x double]. The lower double-precision value is 849 /// compared to the lower double-precision value of \a __a. 850 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 851 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 852 static __inline__ __m128d __DEFAULT_FN_ATTRS 853 _mm_cmpunord_sd(__m128d __a, __m128d __b) 854 { 855 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b); 856 } 857 858 /// \brief Compares the lower double-precision floating-point values in each of 859 /// the two 128-bit floating-point vectors of [2 x double] to determine if 860 /// the value in the first parameter is unequal to the corresponding value in 861 /// the second parameter. 862 /// 863 /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 864 /// 865 /// \headerfile <x86intrin.h> 866 /// 867 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction. 868 /// 869 /// \param __a 870 /// A 128-bit vector of [2 x double]. The lower double-precision value is 871 /// compared to the lower double-precision value of \a __b. 872 /// \param __b 873 /// A 128-bit vector of [2 x double]. The lower double-precision value is 874 /// compared to the lower double-precision value of \a __a. 875 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 876 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
877 static __inline__ __m128d __DEFAULT_FN_ATTRS 878 _mm_cmpneq_sd(__m128d __a, __m128d __b) 879 { 880 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b); 881 } 882 883 /// \brief Compares the lower double-precision floating-point values in each of 884 /// the two 128-bit floating-point vectors of [2 x double] to determine if 885 /// the value in the first parameter is not less than the corresponding 886 /// value in the second parameter. 887 /// 888 /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 889 /// 890 /// \headerfile <x86intrin.h> 891 /// 892 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. 893 /// 894 /// \param __a 895 /// A 128-bit vector of [2 x double]. The lower double-precision value is 896 /// compared to the lower double-precision value of \a __b. 897 /// \param __b 898 /// A 128-bit vector of [2 x double]. The lower double-precision value is 899 /// compared to the lower double-precision value of \a __a. 900 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 901 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 902 static __inline__ __m128d __DEFAULT_FN_ATTRS 903 _mm_cmpnlt_sd(__m128d __a, __m128d __b) 904 { 905 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b); 906 } 907 908 /// \brief Compares the lower double-precision floating-point values in each of 909 /// the two 128-bit floating-point vectors of [2 x double] to determine if 910 /// the value in the first parameter is not less than or equal to the 911 /// corresponding value in the second parameter. 912 /// 913 /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 914 /// 915 /// \headerfile <x86intrin.h> 916 /// 917 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. 918 /// 919 /// \param __a 920 /// A 128-bit vector of [2 x double]. 
The lower double-precision value is 921 /// compared to the lower double-precision value of \a __b. 922 /// \param __b 923 /// A 128-bit vector of [2 x double]. The lower double-precision value is 924 /// compared to the lower double-precision value of \a __a. 925 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 926 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 927 static __inline__ __m128d __DEFAULT_FN_ATTRS 928 _mm_cmpnle_sd(__m128d __a, __m128d __b) 929 { 930 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b); 931 } 932 933 /// \brief Compares the lower double-precision floating-point values in each of 934 /// the two 128-bit floating-point vectors of [2 x double] to determine if 935 /// the value in the first parameter is not greater than the corresponding 936 /// value in the second parameter. 937 /// 938 /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 939 /// 940 /// \headerfile <x86intrin.h> 941 /// 942 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. 943 /// 944 /// \param __a 945 /// A 128-bit vector of [2 x double]. The lower double-precision value is 946 /// compared to the lower double-precision value of \a __b. 947 /// \param __b 948 /// A 128-bit vector of [2 x double]. The lower double-precision value is 949 /// compared to the lower double-precision value of \a __a. 950 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 951 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
952 static __inline__ __m128d __DEFAULT_FN_ATTRS 953 _mm_cmpngt_sd(__m128d __a, __m128d __b) 954 { 955 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a); 956 return (__m128d) { __c[0], __a[1] }; 957 } 958 959 /// \brief Compares the lower double-precision floating-point values in each of 960 /// the two 128-bit floating-point vectors of [2 x double] to determine if 961 /// the value in the first parameter is not greater than or equal to the 962 /// corresponding value in the second parameter. 963 /// 964 /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 965 /// 966 /// \headerfile <x86intrin.h> 967 /// 968 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. 969 /// 970 /// \param __a 971 /// A 128-bit vector of [2 x double]. The lower double-precision value is 972 /// compared to the lower double-precision value of \a __b. 973 /// \param __b 974 /// A 128-bit vector of [2 x double]. The lower double-precision value is 975 /// compared to the lower double-precision value of \a __a. 976 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 977 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 978 static __inline__ __m128d __DEFAULT_FN_ATTRS 979 _mm_cmpnge_sd(__m128d __a, __m128d __b) 980 { 981 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a); 982 return (__m128d) { __c[0], __a[1] }; 983 } 984 985 /// \brief Compares the lower double-precision floating-point values in each of 986 /// the two 128-bit floating-point vectors of [2 x double] for equality. The 987 /// comparison yields 0 for false, 1 for true. 988 /// 989 /// \headerfile <x86intrin.h> 990 /// 991 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 992 /// 993 /// \param __a 994 /// A 128-bit vector of [2 x double]. The lower double-precision value is 995 /// compared to the lower double-precision value of \a __b. 
996 /// \param __b 997 /// A 128-bit vector of [2 x double]. The lower double-precision value is 998 /// compared to the lower double-precision value of \a __a. 999 /// \returns An integer containing the comparison results. 1000 static __inline__ int __DEFAULT_FN_ATTRS 1001 _mm_comieq_sd(__m128d __a, __m128d __b) 1002 { 1003 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b); 1004 } 1005 1006 /// \brief Compares the lower double-precision floating-point values in each of 1007 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1008 /// the value in the first parameter is less than the corresponding value in 1009 /// the second parameter. 1010 /// 1011 /// The comparison yields 0 for false, 1 for true. 1012 /// 1013 /// \headerfile <x86intrin.h> 1014 /// 1015 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1016 /// 1017 /// \param __a 1018 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1019 /// compared to the lower double-precision value of \a __b. 1020 /// \param __b 1021 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1022 /// compared to the lower double-precision value of \a __a. 1023 /// \returns An integer containing the comparison results. 1024 static __inline__ int __DEFAULT_FN_ATTRS 1025 _mm_comilt_sd(__m128d __a, __m128d __b) 1026 { 1027 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b); 1028 } 1029 1030 /// \brief Compares the lower double-precision floating-point values in each of 1031 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1032 /// the value in the first parameter is less than or equal to the 1033 /// corresponding value in the second parameter. 1034 /// 1035 /// The comparison yields 0 for false, 1 for true. 1036 /// 1037 /// \headerfile <x86intrin.h> 1038 /// 1039 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 
1040 /// 1041 /// \param __a 1042 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1043 /// compared to the lower double-precision value of \a __b. 1044 /// \param __b 1045 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1046 /// compared to the lower double-precision value of \a __a. 1047 /// \returns An integer containing the comparison results. 1048 static __inline__ int __DEFAULT_FN_ATTRS 1049 _mm_comile_sd(__m128d __a, __m128d __b) 1050 { 1051 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b); 1052 } 1053 1054 /// \brief Compares the lower double-precision floating-point values in each of 1055 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1056 /// the value in the first parameter is greater than the corresponding value 1057 /// in the second parameter. 1058 /// 1059 /// The comparison yields 0 for false, 1 for true. 1060 /// 1061 /// \headerfile <x86intrin.h> 1062 /// 1063 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1064 /// 1065 /// \param __a 1066 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1067 /// compared to the lower double-precision value of \a __b. 1068 /// \param __b 1069 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1070 /// compared to the lower double-precision value of \a __a. 1071 /// \returns An integer containing the comparison results. 1072 static __inline__ int __DEFAULT_FN_ATTRS 1073 _mm_comigt_sd(__m128d __a, __m128d __b) 1074 { 1075 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b); 1076 } 1077 1078 /// \brief Compares the lower double-precision floating-point values in each of 1079 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1080 /// the value in the first parameter is greater than or equal to the 1081 /// corresponding value in the second parameter. 1082 /// 1083 /// The comparison yields 0 for false, 1 for true. 
1084 /// 1085 /// \headerfile <x86intrin.h> 1086 /// 1087 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1088 /// 1089 /// \param __a 1090 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1091 /// compared to the lower double-precision value of \a __b. 1092 /// \param __b 1093 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1094 /// compared to the lower double-precision value of \a __a. 1095 /// \returns An integer containing the comparison results. 1096 static __inline__ int __DEFAULT_FN_ATTRS 1097 _mm_comige_sd(__m128d __a, __m128d __b) 1098 { 1099 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b); 1100 } 1101 1102 /// \brief Compares the lower double-precision floating-point values in each of 1103 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1104 /// the value in the first parameter is unequal to the corresponding value in 1105 /// the second parameter. 1106 /// 1107 /// The comparison yields 0 for false, 1 for true. 1108 /// 1109 /// \headerfile <x86intrin.h> 1110 /// 1111 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1112 /// 1113 /// \param __a 1114 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1115 /// compared to the lower double-precision value of \a __b. 1116 /// \param __b 1117 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1118 /// compared to the lower double-precision value of \a __a. 1119 /// \returns An integer containing the comparison results. 1120 static __inline__ int __DEFAULT_FN_ATTRS 1121 _mm_comineq_sd(__m128d __a, __m128d __b) 1122 { 1123 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b); 1124 } 1125 1126 /// \brief Compares the lower double-precision floating-point values in each of 1127 /// the two 128-bit floating-point vectors of [2 x double] for equality. The 1128 /// comparison yields 0 for false, 1 for true. 
1129 /// 1130 /// If either of the two lower double-precision values is NaN, 1 is returned. 1131 /// 1132 /// \headerfile <x86intrin.h> 1133 /// 1134 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1135 /// 1136 /// \param __a 1137 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1138 /// compared to the lower double-precision value of \a __b. 1139 /// \param __b 1140 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1141 /// compared to the lower double-precision value of \a __a. 1142 /// \returns An integer containing the comparison results. If either of the two 1143 /// lower double-precision values is NaN, 1 is returned. 1144 static __inline__ int __DEFAULT_FN_ATTRS 1145 _mm_ucomieq_sd(__m128d __a, __m128d __b) 1146 { 1147 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b); 1148 } 1149 1150 /// \brief Compares the lower double-precision floating-point values in each of 1151 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1152 /// the value in the first parameter is less than the corresponding value in 1153 /// the second parameter. 1154 /// 1155 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1156 /// double-precision values is NaN, 1 is returned. 1157 /// 1158 /// \headerfile <x86intrin.h> 1159 /// 1160 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1161 /// 1162 /// \param __a 1163 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1164 /// compared to the lower double-precision value of \a __b. 1165 /// \param __b 1166 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1167 /// compared to the lower double-precision value of \a __a. 1168 /// \returns An integer containing the comparison results. If either of the two 1169 /// lower double-precision values is NaN, 1 is returned. 
1170 static __inline__ int __DEFAULT_FN_ATTRS 1171 _mm_ucomilt_sd(__m128d __a, __m128d __b) 1172 { 1173 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b); 1174 } 1175 1176 /// \brief Compares the lower double-precision floating-point values in each of 1177 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1178 /// the value in the first parameter is less than or equal to the 1179 /// corresponding value in the second parameter. 1180 /// 1181 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1182 /// double-precision values is NaN, 1 is returned. 1183 /// 1184 /// \headerfile <x86intrin.h> 1185 /// 1186 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1187 /// 1188 /// \param __a 1189 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1190 /// compared to the lower double-precision value of \a __b. 1191 /// \param __b 1192 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1193 /// compared to the lower double-precision value of \a __a. 1194 /// \returns An integer containing the comparison results. If either of the two 1195 /// lower double-precision values is NaN, 1 is returned. 1196 static __inline__ int __DEFAULT_FN_ATTRS 1197 _mm_ucomile_sd(__m128d __a, __m128d __b) 1198 { 1199 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b); 1200 } 1201 1202 /// \brief Compares the lower double-precision floating-point values in each of 1203 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1204 /// the value in the first parameter is greater than the corresponding value 1205 /// in the second parameter. 1206 /// 1207 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1208 /// double-precision values is NaN, 0 is returned. 1209 /// 1210 /// \headerfile <x86intrin.h> 1211 /// 1212 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 
1213 /// 1214 /// \param __a 1215 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1216 /// compared to the lower double-precision value of \a __b. 1217 /// \param __b 1218 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1219 /// compared to the lower double-precision value of \a __a. 1220 /// \returns An integer containing the comparison results. If either of the two 1221 /// lower double-precision values is NaN, 0 is returned. 1222 static __inline__ int __DEFAULT_FN_ATTRS 1223 _mm_ucomigt_sd(__m128d __a, __m128d __b) 1224 { 1225 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b); 1226 } 1227 1228 /// \brief Compares the lower double-precision floating-point values in each of 1229 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1230 /// the value in the first parameter is greater than or equal to the 1231 /// corresponding value in the second parameter. 1232 /// 1233 /// The comparison yields 0 for false, 1 for true. If either of the two 1234 /// lower double-precision values is NaN, 0 is returned. 1235 /// 1236 /// \headerfile <x86intrin.h> 1237 /// 1238 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1239 /// 1240 /// \param __a 1241 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1242 /// compared to the lower double-precision value of \a __b. 1243 /// \param __b 1244 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1245 /// compared to the lower double-precision value of \a __a. 1246 /// \returns An integer containing the comparison results. If either of the two 1247 /// lower double-precision values is NaN, 0 is returned. 
1248 static __inline__ int __DEFAULT_FN_ATTRS 1249 _mm_ucomige_sd(__m128d __a, __m128d __b) 1250 { 1251 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b); 1252 } 1253 1254 /// \brief Compares the lower double-precision floating-point values in each of 1255 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1256 /// the value in the first parameter is unequal to the corresponding value in 1257 /// the second parameter. 1258 /// 1259 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1260 /// double-precision values is NaN, 0 is returned. 1261 /// 1262 /// \headerfile <x86intrin.h> 1263 /// 1264 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1265 /// 1266 /// \param __a 1267 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1268 /// compared to the lower double-precision value of \a __b. 1269 /// \param __b 1270 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1271 /// compared to the lower double-precision value of \a __a. 1272 /// \returns An integer containing the comparison result. If either of the two 1273 /// lower double-precision values is NaN, 0 is returned. 1274 static __inline__ int __DEFAULT_FN_ATTRS 1275 _mm_ucomineq_sd(__m128d __a, __m128d __b) 1276 { 1277 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b); 1278 } 1279 1280 /// \brief Converts the two double-precision floating-point elements of a 1281 /// 128-bit vector of [2 x double] into two single-precision floating-point 1282 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float]. 1283 /// The upper 64 bits of the result vector are set to zero. 1284 /// 1285 /// \headerfile <x86intrin.h> 1286 /// 1287 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction. 1288 /// 1289 /// \param __a 1290 /// A 128-bit vector of [2 x double]. 
1291 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1292 /// converted values. The upper 64 bits are set to zero. 1293 static __inline__ __m128 __DEFAULT_FN_ATTRS 1294 _mm_cvtpd_ps(__m128d __a) 1295 { 1296 return __builtin_ia32_cvtpd2ps((__v2df)__a); 1297 } 1298 1299 /// \brief Converts the lower two single-precision floating-point elements of a 1300 /// 128-bit vector of [4 x float] into two double-precision floating-point 1301 /// values, returned in a 128-bit vector of [2 x double]. The upper two 1302 /// elements of the input vector are unused. 1303 /// 1304 /// \headerfile <x86intrin.h> 1305 /// 1306 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction. 1307 /// 1308 /// \param __a 1309 /// A 128-bit vector of [4 x float]. The lower two single-precision 1310 /// floating-point elements are converted to double-precision values. The 1311 /// upper two elements are unused. 1312 /// \returns A 128-bit vector of [2 x double] containing the converted values. 1313 static __inline__ __m128d __DEFAULT_FN_ATTRS 1314 _mm_cvtps_pd(__m128 __a) 1315 { 1316 return (__m128d) __builtin_convertvector( 1317 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); 1318 } 1319 1320 /// \brief Converts the lower two integer elements of a 128-bit vector of 1321 /// [4 x i32] into two double-precision floating-point values, returned in a 1322 /// 128-bit vector of [2 x double]. 1323 /// 1324 /// The upper two elements of the input vector are unused. 1325 /// 1326 /// \headerfile <x86intrin.h> 1327 /// 1328 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction. 1329 /// 1330 /// \param __a 1331 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are 1332 /// converted to double-precision values. 1333 /// 1334 /// The upper two elements are unused. 1335 /// \returns A 128-bit vector of [2 x double] containing the converted values. 
1336 static __inline__ __m128d __DEFAULT_FN_ATTRS 1337 _mm_cvtepi32_pd(__m128i __a) 1338 { 1339 return (__m128d) __builtin_convertvector( 1340 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); 1341 } 1342 1343 /// \brief Converts the two double-precision floating-point elements of a 1344 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1345 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper 1346 /// 64 bits of the result vector are set to zero. 1347 /// 1348 /// \headerfile <x86intrin.h> 1349 /// 1350 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction. 1351 /// 1352 /// \param __a 1353 /// A 128-bit vector of [2 x double]. 1354 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1355 /// converted values. The upper 64 bits are set to zero. 1356 static __inline__ __m128i __DEFAULT_FN_ATTRS 1357 _mm_cvtpd_epi32(__m128d __a) 1358 { 1359 return __builtin_ia32_cvtpd2dq((__v2df)__a); 1360 } 1361 1362 /// \brief Converts the low-order element of a 128-bit vector of [2 x double] 1363 /// into a 32-bit signed integer value. 1364 /// 1365 /// \headerfile <x86intrin.h> 1366 /// 1367 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. 1368 /// 1369 /// \param __a 1370 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1371 /// conversion. 1372 /// \returns A 32-bit signed integer containing the converted value. 1373 static __inline__ int __DEFAULT_FN_ATTRS 1374 _mm_cvtsd_si32(__m128d __a) 1375 { 1376 return __builtin_ia32_cvtsd2si((__v2df)__a); 1377 } 1378 1379 /// \brief Converts the lower double-precision floating-point element of a 1380 /// 128-bit vector of [2 x double], in the second parameter, into a 1381 /// single-precision floating-point value, returned in the lower 32 bits of a 1382 /// 128-bit vector of [4 x float]. 
The upper 96 bits of the result vector are 1383 /// copied from the upper 96 bits of the first parameter. 1384 /// 1385 /// \headerfile <x86intrin.h> 1386 /// 1387 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction. 1388 /// 1389 /// \param __a 1390 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are 1391 /// copied to the upper 96 bits of the result. 1392 /// \param __b 1393 /// A 128-bit vector of [2 x double]. The lower double-precision 1394 /// floating-point element is used in the conversion. 1395 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the 1396 /// converted value from the second parameter. The upper 96 bits are copied 1397 /// from the upper 96 bits of the first parameter. 1398 static __inline__ __m128 __DEFAULT_FN_ATTRS 1399 _mm_cvtsd_ss(__m128 __a, __m128d __b) 1400 { 1401 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b); 1402 } 1403 1404 /// \brief Converts a 32-bit signed integer value, in the second parameter, into 1405 /// a double-precision floating-point value, returned in the lower 64 bits of 1406 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1407 /// are copied from the upper 64 bits of the first parameter. 1408 /// 1409 /// \headerfile <x86intrin.h> 1410 /// 1411 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. 1412 /// 1413 /// \param __a 1414 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1415 /// copied to the upper 64 bits of the result. 1416 /// \param __b 1417 /// A 32-bit signed integer containing the value to be converted. 1418 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1419 /// converted value from the second parameter. The upper 64 bits are copied 1420 /// from the upper 64 bits of the first parameter. 
1421 static __inline__ __m128d __DEFAULT_FN_ATTRS 1422 _mm_cvtsi32_sd(__m128d __a, int __b) 1423 { 1424 __a[0] = __b; 1425 return __a; 1426 } 1427 1428 /// \brief Converts the lower single-precision floating-point element of a 1429 /// 128-bit vector of [4 x float], in the second parameter, into a 1430 /// double-precision floating-point value, returned in the lower 64 bits of 1431 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1432 /// are copied from the upper 64 bits of the first parameter. 1433 /// 1434 /// \headerfile <x86intrin.h> 1435 /// 1436 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction. 1437 /// 1438 /// \param __a 1439 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1440 /// copied to the upper 64 bits of the result. 1441 /// \param __b 1442 /// A 128-bit vector of [4 x float]. The lower single-precision 1443 /// floating-point element is used in the conversion. 1444 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1445 /// converted value from the second parameter. The upper 64 bits are copied 1446 /// from the upper 64 bits of the first parameter. 1447 static __inline__ __m128d __DEFAULT_FN_ATTRS 1448 _mm_cvtss_sd(__m128d __a, __m128 __b) 1449 { 1450 __a[0] = __b[0]; 1451 return __a; 1452 } 1453 1454 /// \brief Converts the two double-precision floating-point elements of a 1455 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1456 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. 1457 /// 1458 /// If the result of either conversion is inexact, the result is truncated 1459 /// (rounded towards zero) regardless of the current MXCSR setting. The upper 1460 /// 64 bits of the result vector are set to zero. 1461 /// 1462 /// \headerfile <x86intrin.h> 1463 /// 1464 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c> 1465 /// instruction. 
1466 /// 1467 /// \param __a 1468 /// A 128-bit vector of [2 x double]. 1469 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1470 /// converted values. The upper 64 bits are set to zero. 1471 static __inline__ __m128i __DEFAULT_FN_ATTRS 1472 _mm_cvttpd_epi32(__m128d __a) 1473 { 1474 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a); 1475 } 1476 1477 /// \brief Converts the low-order element of a [2 x double] vector into a 32-bit 1478 /// signed integer value, truncating the result when it is inexact. 1479 /// 1480 /// \headerfile <x86intrin.h> 1481 /// 1482 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> 1483 /// instruction. 1484 /// 1485 /// \param __a 1486 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1487 /// conversion. 1488 /// \returns A 32-bit signed integer containing the converted value. 1489 static __inline__ int __DEFAULT_FN_ATTRS 1490 _mm_cvttsd_si32(__m128d __a) 1491 { 1492 return __builtin_ia32_cvttsd2si((__v2df)__a); 1493 } 1494 1495 /// \brief Converts the two double-precision floating-point elements of a 1496 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1497 /// returned in a 64-bit vector of [2 x i32]. 1498 /// 1499 /// \headerfile <x86intrin.h> 1500 /// 1501 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction. 1502 /// 1503 /// \param __a 1504 /// A 128-bit vector of [2 x double]. 1505 /// \returns A 64-bit vector of [2 x i32] containing the converted values. 1506 static __inline__ __m64 __DEFAULT_FN_ATTRS 1507 _mm_cvtpd_pi32(__m128d __a) 1508 { 1509 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); 1510 } 1511 1512 /// \brief Converts the two double-precision floating-point elements of a 1513 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1514 /// returned in a 64-bit vector of [2 x i32]. 
1515 /// 1516 /// If the result of either conversion is inexact, the result is truncated 1517 /// (rounded towards zero) regardless of the current MXCSR setting. 1518 /// 1519 /// \headerfile <x86intrin.h> 1520 /// 1521 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction. 1522 /// 1523 /// \param __a 1524 /// A 128-bit vector of [2 x double]. 1525 /// \returns A 64-bit vector of [2 x i32] containing the converted values. 1526 static __inline__ __m64 __DEFAULT_FN_ATTRS 1527 _mm_cvttpd_pi32(__m128d __a) 1528 { 1529 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); 1530 } 1531 1532 /// \brief Converts the two signed 32-bit integer elements of a 64-bit vector of 1533 /// [2 x i32] into two double-precision floating-point values, returned in a 1534 /// 128-bit vector of [2 x double]. 1535 /// 1536 /// \headerfile <x86intrin.h> 1537 /// 1538 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction. 1539 /// 1540 /// \param __a 1541 /// A 64-bit vector of [2 x i32]. 1542 /// \returns A 128-bit vector of [2 x double] containing the converted values. 1543 static __inline__ __m128d __DEFAULT_FN_ATTRS 1544 _mm_cvtpi32_pd(__m64 __a) 1545 { 1546 return __builtin_ia32_cvtpi2pd((__v2si)__a); 1547 } 1548 1549 /// \brief Returns the low-order element of a 128-bit vector of [2 x double] as 1550 /// a double-precision floating-point value. 1551 /// 1552 /// \headerfile <x86intrin.h> 1553 /// 1554 /// This intrinsic has no corresponding instruction. 1555 /// 1556 /// \param __a 1557 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned. 1558 /// \returns A double-precision floating-point value copied from the lower 64 1559 /// bits of \a __a. 1560 static __inline__ double __DEFAULT_FN_ATTRS 1561 _mm_cvtsd_f64(__m128d __a) 1562 { 1563 return __a[0]; 1564 } 1565 1566 /// \brief Loads a 128-bit floating-point vector of [2 x double] from an aligned 1567 /// memory location. 
1568 /// 1569 /// \headerfile <x86intrin.h> 1570 /// 1571 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction. 1572 /// 1573 /// \param __dp 1574 /// A pointer to a 128-bit memory location. The address of the memory 1575 /// location has to be 16-byte aligned. 1576 /// \returns A 128-bit vector of [2 x double] containing the loaded values. 1577 static __inline__ __m128d __DEFAULT_FN_ATTRS 1578 _mm_load_pd(double const *__dp) 1579 { 1580 return *(__m128d*)__dp; 1581 } 1582 1583 /// \brief Loads a double-precision floating-point value from a specified memory 1584 /// location and duplicates it to both vector elements of a 128-bit vector of 1585 /// [2 x double]. 1586 /// 1587 /// \headerfile <x86intrin.h> 1588 /// 1589 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction. 1590 /// 1591 /// \param __dp 1592 /// A pointer to a memory location containing a double-precision value. 1593 /// \returns A 128-bit vector of [2 x double] containing the loaded and 1594 /// duplicated values. 1595 static __inline__ __m128d __DEFAULT_FN_ATTRS 1596 _mm_load1_pd(double const *__dp) 1597 { 1598 struct __mm_load1_pd_struct { 1599 double __u; 1600 } __attribute__((__packed__, __may_alias__)); 1601 double __u = ((struct __mm_load1_pd_struct*)__dp)->__u; 1602 return (__m128d){ __u, __u }; 1603 } 1604 1605 #define _mm_load_pd1(dp) _mm_load1_pd(dp) 1606 1607 /// \brief Loads two double-precision values, in reverse order, from an aligned 1608 /// memory location into a 128-bit vector of [2 x double]. 1609 /// 1610 /// \headerfile <x86intrin.h> 1611 /// 1612 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction + 1613 /// needed shuffling instructions. In AVX mode, the shuffling may be combined 1614 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction. 1615 /// 1616 /// \param __dp 1617 /// A 16-byte aligned pointer to an array of double-precision values to be 1618 /// loaded in reverse order. 
1619 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded 1620 /// values. 1621 static __inline__ __m128d __DEFAULT_FN_ATTRS 1622 _mm_loadr_pd(double const *__dp) 1623 { 1624 __m128d __u = *(__m128d*)__dp; 1625 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0); 1626 } 1627 1628 /// \brief Loads a 128-bit floating-point vector of [2 x double] from an 1629 /// unaligned memory location. 1630 /// 1631 /// \headerfile <x86intrin.h> 1632 /// 1633 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction. 1634 /// 1635 /// \param __dp 1636 /// A pointer to a 128-bit memory location. The address of the memory 1637 /// location does not have to be aligned. 1638 /// \returns A 128-bit vector of [2 x double] containing the loaded values. 1639 static __inline__ __m128d __DEFAULT_FN_ATTRS 1640 _mm_loadu_pd(double const *__dp) 1641 { 1642 struct __loadu_pd { 1643 __m128d __v; 1644 } __attribute__((__packed__, __may_alias__)); 1645 return ((struct __loadu_pd*)__dp)->__v; 1646 } 1647 1648 /// \brief Loads a 64-bit integer value to the low element of a 128-bit integer 1649 /// vector and clears the upper element. 1650 /// 1651 /// \headerfile <x86intrin.h> 1652 /// 1653 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 1654 /// 1655 /// \param __a 1656 /// A pointer to a 64-bit memory location. The address of the memory 1657 /// location does not have to be aligned. 1658 /// \returns A 128-bit vector of [2 x i64] containing the loaded value. 1659 static __inline__ __m128i __DEFAULT_FN_ATTRS 1660 _mm_loadu_si64(void const *__a) 1661 { 1662 struct __loadu_si64 { 1663 long long __v; 1664 } __attribute__((__packed__, __may_alias__)); 1665 long long __u = ((struct __loadu_si64*)__a)->__v; 1666 return (__m128i){__u, 0L}; 1667 } 1668 1669 /// \brief Loads a 64-bit double-precision value to the low element of a 1670 /// 128-bit integer vector and clears the upper element. 
1671 /// 1672 /// \headerfile <x86intrin.h> 1673 /// 1674 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction. 1675 /// 1676 /// \param __dp 1677 /// A pointer to a memory location containing a double-precision value. 1678 /// The address of the memory location does not have to be aligned. 1679 /// \returns A 128-bit vector of [2 x double] containing the loaded value. 1680 static __inline__ __m128d __DEFAULT_FN_ATTRS 1681 _mm_load_sd(double const *__dp) 1682 { 1683 struct __mm_load_sd_struct { 1684 double __u; 1685 } __attribute__((__packed__, __may_alias__)); 1686 double __u = ((struct __mm_load_sd_struct*)__dp)->__u; 1687 return (__m128d){ __u, 0 }; 1688 } 1689 1690 /// \brief Loads a double-precision value into the high-order bits of a 128-bit 1691 /// vector of [2 x double]. The low-order bits are copied from the low-order 1692 /// bits of the first operand. 1693 /// 1694 /// \headerfile <x86intrin.h> 1695 /// 1696 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 1697 /// 1698 /// \param __a 1699 /// A 128-bit vector of [2 x double]. \n 1700 /// Bits [63:0] are written to bits [63:0] of the result. 1701 /// \param __dp 1702 /// A pointer to a 64-bit memory location containing a double-precision 1703 /// floating-point value that is loaded. The loaded value is written to bits 1704 /// [127:64] of the result. The address of the memory location does not have 1705 /// to be aligned. 1706 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1707 static __inline__ __m128d __DEFAULT_FN_ATTRS 1708 _mm_loadh_pd(__m128d __a, double const *__dp) 1709 { 1710 struct __mm_loadh_pd_struct { 1711 double __u; 1712 } __attribute__((__packed__, __may_alias__)); 1713 double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u; 1714 return (__m128d){ __a[0], __u }; 1715 } 1716 1717 /// \brief Loads a double-precision value into the low-order bits of a 128-bit 1718 /// vector of [2 x double]. 
The high-order bits are copied from the 1719 /// high-order bits of the first operand. 1720 /// 1721 /// \headerfile <x86intrin.h> 1722 /// 1723 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 1724 /// 1725 /// \param __a 1726 /// A 128-bit vector of [2 x double]. \n 1727 /// Bits [127:64] are written to bits [127:64] of the result. 1728 /// \param __dp 1729 /// A pointer to a 64-bit memory location containing a double-precision 1730 /// floating-point value that is loaded. The loaded value is written to bits 1731 /// [63:0] of the result. The address of the memory location does not have to 1732 /// be aligned. 1733 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1734 static __inline__ __m128d __DEFAULT_FN_ATTRS 1735 _mm_loadl_pd(__m128d __a, double const *__dp) 1736 { 1737 struct __mm_loadl_pd_struct { 1738 double __u; 1739 } __attribute__((__packed__, __may_alias__)); 1740 double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u; 1741 return (__m128d){ __u, __a[1] }; 1742 } 1743 1744 /// \brief Constructs a 128-bit floating-point vector of [2 x double] with 1745 /// unspecified content. This could be used as an argument to another 1746 /// intrinsic function where the argument is required but the value is not 1747 /// actually used. 1748 /// 1749 /// \headerfile <x86intrin.h> 1750 /// 1751 /// This intrinsic has no corresponding instruction. 1752 /// 1753 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified 1754 /// content. 1755 static __inline__ __m128d __DEFAULT_FN_ATTRS 1756 _mm_undefined_pd(void) 1757 { 1758 return (__m128d)__builtin_ia32_undef128(); 1759 } 1760 1761 /// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower 1762 /// 64 bits of the vector are initialized with the specified double-precision 1763 /// floating-point value. The upper 64 bits are set to zero. 
1764 /// 1765 /// \headerfile <x86intrin.h> 1766 /// 1767 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 1768 /// 1769 /// \param __w 1770 /// A double-precision floating-point value used to initialize the lower 64 1771 /// bits of the result. 1772 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The 1773 /// lower 64 bits contain the value of the parameter. The upper 64 bits are 1774 /// set to zero. 1775 static __inline__ __m128d __DEFAULT_FN_ATTRS 1776 _mm_set_sd(double __w) 1777 { 1778 return (__m128d){ __w, 0 }; 1779 } 1780 1781 /// \brief Constructs a 128-bit floating-point vector of [2 x double], with each 1782 /// of the two double-precision floating-point vector elements set to the 1783 /// specified double-precision floating-point value. 1784 /// 1785 /// \headerfile <x86intrin.h> 1786 /// 1787 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction. 1788 /// 1789 /// \param __w 1790 /// A double-precision floating-point value used to initialize each vector 1791 /// element of the result. 1792 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1793 static __inline__ __m128d __DEFAULT_FN_ATTRS 1794 _mm_set1_pd(double __w) 1795 { 1796 return (__m128d){ __w, __w }; 1797 } 1798 1799 /// \brief Constructs a 128-bit floating-point vector of [2 x double], with each 1800 /// of the two double-precision floating-point vector elements set to the 1801 /// specified double-precision floating-point value. 1802 /// 1803 /// \headerfile <x86intrin.h> 1804 /// 1805 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction. 1806 /// 1807 /// \param __w 1808 /// A double-precision floating-point value used to initialize each vector 1809 /// element of the result. 1810 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 
1811 static __inline__ __m128d __DEFAULT_FN_ATTRS 1812 _mm_set_pd1(double __w) 1813 { 1814 return _mm_set1_pd(__w); 1815 } 1816 1817 /// \brief Constructs a 128-bit floating-point vector of [2 x double] 1818 /// initialized with the specified double-precision floating-point values. 1819 /// 1820 /// \headerfile <x86intrin.h> 1821 /// 1822 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 1823 /// 1824 /// \param __w 1825 /// A double-precision floating-point value used to initialize the upper 64 1826 /// bits of the result. 1827 /// \param __x 1828 /// A double-precision floating-point value used to initialize the lower 64 1829 /// bits of the result. 1830 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1831 static __inline__ __m128d __DEFAULT_FN_ATTRS 1832 _mm_set_pd(double __w, double __x) 1833 { 1834 return (__m128d){ __x, __w }; 1835 } 1836 1837 /// \brief Constructs a 128-bit floating-point vector of [2 x double], 1838 /// initialized in reverse order with the specified double-precision 1839 /// floating-point values. 1840 /// 1841 /// \headerfile <x86intrin.h> 1842 /// 1843 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 1844 /// 1845 /// \param __w 1846 /// A double-precision floating-point value used to initialize the lower 64 1847 /// bits of the result. 1848 /// \param __x 1849 /// A double-precision floating-point value used to initialize the upper 64 1850 /// bits of the result. 1851 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1852 static __inline__ __m128d __DEFAULT_FN_ATTRS 1853 _mm_setr_pd(double __w, double __x) 1854 { 1855 return (__m128d){ __w, __x }; 1856 } 1857 1858 /// \brief Constructs a 128-bit floating-point vector of [2 x double] 1859 /// initialized to zero. 1860 /// 1861 /// \headerfile <x86intrin.h> 1862 /// 1863 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 
1864 /// 1865 /// \returns An initialized 128-bit floating-point vector of [2 x double] with 1866 /// all elements set to zero. 1867 static __inline__ __m128d __DEFAULT_FN_ATTRS 1868 _mm_setzero_pd(void) 1869 { 1870 return (__m128d){ 0, 0 }; 1871 } 1872 1873 /// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower 1874 /// 64 bits are set to the lower 64 bits of the second parameter. The upper 1875 /// 64 bits are set to the upper 64 bits of the first parameter. 1876 /// 1877 /// \headerfile <x86intrin.h> 1878 /// 1879 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction. 1880 /// 1881 /// \param __a 1882 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the 1883 /// upper 64 bits of the result. 1884 /// \param __b 1885 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the 1886 /// lower 64 bits of the result. 1887 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1888 static __inline__ __m128d __DEFAULT_FN_ATTRS 1889 _mm_move_sd(__m128d __a, __m128d __b) 1890 { 1891 return (__m128d){ __b[0], __a[1] }; 1892 } 1893 1894 /// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a 1895 /// memory location. 1896 /// 1897 /// \headerfile <x86intrin.h> 1898 /// 1899 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction. 1900 /// 1901 /// \param __dp 1902 /// A pointer to a 64-bit memory location. 1903 /// \param __a 1904 /// A 128-bit vector of [2 x double] containing the value to be stored. 1905 static __inline__ void __DEFAULT_FN_ATTRS 1906 _mm_store_sd(double *__dp, __m128d __a) 1907 { 1908 struct __mm_store_sd_struct { 1909 double __u; 1910 } __attribute__((__packed__, __may_alias__)); 1911 ((struct __mm_store_sd_struct*)__dp)->__u = __a[0]; 1912 } 1913 1914 /// \brief Moves packed double-precision values from a 128-bit vector of 1915 /// [2 x double] to a memory location. 
1916 /// 1917 /// \headerfile <x86intrin.h> 1918 /// 1919 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction. 1920 /// 1921 /// \param __dp 1922 /// A pointer to an aligned memory location that can store two 1923 /// double-precision values. 1924 /// \param __a 1925 /// A packed 128-bit vector of [2 x double] containing the values to be 1926 /// moved. 1927 static __inline__ void __DEFAULT_FN_ATTRS 1928 _mm_store_pd(double *__dp, __m128d __a) 1929 { 1930 *(__m128d*)__dp = __a; 1931 } 1932 1933 /// \brief Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to 1934 /// the upper and lower 64 bits of a memory location. 1935 /// 1936 /// \headerfile <x86intrin.h> 1937 /// 1938 /// This intrinsic corresponds to the <c>VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction. 1939 /// 1940 /// \param __dp 1941 /// A pointer to a memory location that can store two double-precision 1942 /// values. 1943 /// \param __a 1944 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each 1945 /// of the values in \a dp. 1946 static __inline__ void __DEFAULT_FN_ATTRS 1947 _mm_store1_pd(double *__dp, __m128d __a) 1948 { 1949 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 1950 _mm_store_pd(__dp, __a); 1951 } 1952 1953 /// \brief Stores a 128-bit vector of [2 x double] into an aligned memory 1954 /// location. 1955 /// 1956 /// \headerfile <x86intrin.h> 1957 /// 1958 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction. 1959 /// 1960 /// \param __dp 1961 /// A pointer to a 128-bit memory location. The address of the memory 1962 /// location has to be 16-byte aligned. 1963 /// \param __a 1964 /// A 128-bit vector of [2 x double] containing the values to be stored. 
1965 static __inline__ void __DEFAULT_FN_ATTRS 1966 _mm_store_pd1(double *__dp, __m128d __a) 1967 { 1968 return _mm_store1_pd(__dp, __a); 1969 } 1970 1971 /// \brief Stores a 128-bit vector of [2 x double] into an unaligned memory 1972 /// location. 1973 /// 1974 /// \headerfile <x86intrin.h> 1975 /// 1976 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction. 1977 /// 1978 /// \param __dp 1979 /// A pointer to a 128-bit memory location. The address of the memory 1980 /// location does not have to be aligned. 1981 /// \param __a 1982 /// A 128-bit vector of [2 x double] containing the values to be stored. 1983 static __inline__ void __DEFAULT_FN_ATTRS 1984 _mm_storeu_pd(double *__dp, __m128d __a) 1985 { 1986 struct __storeu_pd { 1987 __m128d __v; 1988 } __attribute__((__packed__, __may_alias__)); 1989 ((struct __storeu_pd*)__dp)->__v = __a; 1990 } 1991 1992 /// \brief Stores two double-precision values, in reverse order, from a 128-bit 1993 /// vector of [2 x double] to a 16-byte aligned memory location. 1994 /// 1995 /// \headerfile <x86intrin.h> 1996 /// 1997 /// This intrinsic corresponds to a shuffling instruction followed by a 1998 /// <c> VMOVAPD / MOVAPD </c> instruction. 1999 /// 2000 /// \param __dp 2001 /// A pointer to a 16-byte aligned memory location that can store two 2002 /// double-precision values. 2003 /// \param __a 2004 /// A 128-bit vector of [2 x double] containing the values to be reversed and 2005 /// stored. 2006 static __inline__ void __DEFAULT_FN_ATTRS 2007 _mm_storer_pd(double *__dp, __m128d __a) 2008 { 2009 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0); 2010 *(__m128d *)__dp = __a; 2011 } 2012 2013 /// \brief Stores the upper 64 bits of a 128-bit vector of [2 x double] to a 2014 /// memory location. 2015 /// 2016 /// \headerfile <x86intrin.h> 2017 /// 2018 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 
2019 /// 2020 /// \param __dp 2021 /// A pointer to a 64-bit memory location. 2022 /// \param __a 2023 /// A 128-bit vector of [2 x double] containing the value to be stored. 2024 static __inline__ void __DEFAULT_FN_ATTRS 2025 _mm_storeh_pd(double *__dp, __m128d __a) 2026 { 2027 struct __mm_storeh_pd_struct { 2028 double __u; 2029 } __attribute__((__packed__, __may_alias__)); 2030 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1]; 2031 } 2032 2033 /// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a 2034 /// memory location. 2035 /// 2036 /// \headerfile <x86intrin.h> 2037 /// 2038 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 2039 /// 2040 /// \param __dp 2041 /// A pointer to a 64-bit memory location. 2042 /// \param __a 2043 /// A 128-bit vector of [2 x double] containing the value to be stored. 2044 static __inline__ void __DEFAULT_FN_ATTRS 2045 _mm_storel_pd(double *__dp, __m128d __a) 2046 { 2047 struct __mm_storeh_pd_struct { 2048 double __u; 2049 } __attribute__((__packed__, __may_alias__)); 2050 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0]; 2051 } 2052 2053 /// \brief Adds the corresponding elements of two 128-bit vectors of [16 x i8], 2054 /// saving the lower 8 bits of each sum in the corresponding element of a 2055 /// 128-bit result vector of [16 x i8]. 2056 /// 2057 /// The integer elements of both parameters can be either signed or unsigned. 2058 /// 2059 /// \headerfile <x86intrin.h> 2060 /// 2061 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction. 2062 /// 2063 /// \param __a 2064 /// A 128-bit vector of [16 x i8]. 2065 /// \param __b 2066 /// A 128-bit vector of [16 x i8]. 2067 /// \returns A 128-bit vector of [16 x i8] containing the sums of both 2068 /// parameters. 
2069 static __inline__ __m128i __DEFAULT_FN_ATTRS 2070 _mm_add_epi8(__m128i __a, __m128i __b) 2071 { 2072 return (__m128i)((__v16qu)__a + (__v16qu)__b); 2073 } 2074 2075 /// \brief Adds the corresponding elements of two 128-bit vectors of [8 x i16], 2076 /// saving the lower 16 bits of each sum in the corresponding element of a 2077 /// 128-bit result vector of [8 x i16]. 2078 /// 2079 /// The integer elements of both parameters can be either signed or unsigned. 2080 /// 2081 /// \headerfile <x86intrin.h> 2082 /// 2083 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction. 2084 /// 2085 /// \param __a 2086 /// A 128-bit vector of [8 x i16]. 2087 /// \param __b 2088 /// A 128-bit vector of [8 x i16]. 2089 /// \returns A 128-bit vector of [8 x i16] containing the sums of both 2090 /// parameters. 2091 static __inline__ __m128i __DEFAULT_FN_ATTRS 2092 _mm_add_epi16(__m128i __a, __m128i __b) 2093 { 2094 return (__m128i)((__v8hu)__a + (__v8hu)__b); 2095 } 2096 2097 /// \brief Adds the corresponding elements of two 128-bit vectors of [4 x i32], 2098 /// saving the lower 32 bits of each sum in the corresponding element of a 2099 /// 128-bit result vector of [4 x i32]. 2100 /// 2101 /// The integer elements of both parameters can be either signed or unsigned. 2102 /// 2103 /// \headerfile <x86intrin.h> 2104 /// 2105 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction. 2106 /// 2107 /// \param __a 2108 /// A 128-bit vector of [4 x i32]. 2109 /// \param __b 2110 /// A 128-bit vector of [4 x i32]. 2111 /// \returns A 128-bit vector of [4 x i32] containing the sums of both 2112 /// parameters. 2113 static __inline__ __m128i __DEFAULT_FN_ATTRS 2114 _mm_add_epi32(__m128i __a, __m128i __b) 2115 { 2116 return (__m128i)((__v4su)__a + (__v4su)__b); 2117 } 2118 2119 /// \brief Adds two signed or unsigned 64-bit integer values, returning the 2120 /// lower 64 bits of the sum. 
2121 /// 2122 /// \headerfile <x86intrin.h> 2123 /// 2124 /// This intrinsic corresponds to the <c> PADDQ </c> instruction. 2125 /// 2126 /// \param __a 2127 /// A 64-bit integer. 2128 /// \param __b 2129 /// A 64-bit integer. 2130 /// \returns A 64-bit integer containing the sum of both parameters. 2131 static __inline__ __m64 __DEFAULT_FN_ATTRS 2132 _mm_add_si64(__m64 __a, __m64 __b) 2133 { 2134 return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); 2135 } 2136 2137 /// \brief Adds the corresponding elements of two 128-bit vectors of [2 x i64], 2138 /// saving the lower 64 bits of each sum in the corresponding element of a 2139 /// 128-bit result vector of [2 x i64]. 2140 /// 2141 /// The integer elements of both parameters can be either signed or unsigned. 2142 /// 2143 /// \headerfile <x86intrin.h> 2144 /// 2145 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction. 2146 /// 2147 /// \param __a 2148 /// A 128-bit vector of [2 x i64]. 2149 /// \param __b 2150 /// A 128-bit vector of [2 x i64]. 2151 /// \returns A 128-bit vector of [2 x i64] containing the sums of both 2152 /// parameters. 2153 static __inline__ __m128i __DEFAULT_FN_ATTRS 2154 _mm_add_epi64(__m128i __a, __m128i __b) 2155 { 2156 return (__m128i)((__v2du)__a + (__v2du)__b); 2157 } 2158 2159 /// \brief Adds, with saturation, the corresponding elements of two 128-bit 2160 /// signed [16 x i8] vectors, saving each sum in the corresponding element of 2161 /// a 128-bit result vector of [16 x i8]. Positive sums greater than 7Fh are 2162 /// saturated to 7Fh. Negative sums less than 80h are saturated to 80h. 2163 /// 2164 /// \headerfile <x86intrin.h> 2165 /// 2166 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction. 2167 /// 2168 /// \param __a 2169 /// A 128-bit signed [16 x i8] vector. 2170 /// \param __b 2171 /// A 128-bit signed [16 x i8] vector. 
2172 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of 2173 /// both parameters. 2174 static __inline__ __m128i __DEFAULT_FN_ATTRS 2175 _mm_adds_epi8(__m128i __a, __m128i __b) 2176 { 2177 return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b); 2178 } 2179 2180 /// \brief Adds, with saturation, the corresponding elements of two 128-bit 2181 /// signed [8 x i16] vectors, saving each sum in the corresponding element of 2182 /// a 128-bit result vector of [8 x i16]. Positive sums greater than 7FFFh 2183 /// are saturated to 7FFFh. Negative sums less than 8000h are saturated to 2184 /// 8000h. 2185 /// 2186 /// \headerfile <x86intrin.h> 2187 /// 2188 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction. 2189 /// 2190 /// \param __a 2191 /// A 128-bit signed [8 x i16] vector. 2192 /// \param __b 2193 /// A 128-bit signed [8 x i16] vector. 2194 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of 2195 /// both parameters. 2196 static __inline__ __m128i __DEFAULT_FN_ATTRS 2197 _mm_adds_epi16(__m128i __a, __m128i __b) 2198 { 2199 return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b); 2200 } 2201 2202 /// \brief Adds, with saturation, the corresponding elements of two 128-bit 2203 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element 2204 /// of a 128-bit result vector of [16 x i8]. Positive sums greater than FFh 2205 /// are saturated to FFh. Negative sums are saturated to 00h. 2206 /// 2207 /// \headerfile <x86intrin.h> 2208 /// 2209 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction. 2210 /// 2211 /// \param __a 2212 /// A 128-bit unsigned [16 x i8] vector. 2213 /// \param __b 2214 /// A 128-bit unsigned [16 x i8] vector. 2215 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums 2216 /// of both parameters. 
2217 static __inline__ __m128i __DEFAULT_FN_ATTRS 2218 _mm_adds_epu8(__m128i __a, __m128i __b) 2219 { 2220 return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b); 2221 } 2222 2223 /// \brief Adds, with saturation, the corresponding elements of two 128-bit 2224 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element 2225 /// of a 128-bit result vector of [8 x i16]. Positive sums greater than FFFFh 2226 /// are saturated to FFFFh. Negative sums are saturated to 0000h. 2227 /// 2228 /// \headerfile <x86intrin.h> 2229 /// 2230 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction. 2231 /// 2232 /// \param __a 2233 /// A 128-bit unsigned [8 x i16] vector. 2234 /// \param __b 2235 /// A 128-bit unsigned [8 x i16] vector. 2236 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums 2237 /// of both parameters. 2238 static __inline__ __m128i __DEFAULT_FN_ATTRS 2239 _mm_adds_epu16(__m128i __a, __m128i __b) 2240 { 2241 return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b); 2242 } 2243 2244 /// \brief Computes the rounded avarages of corresponding elements of two 2245 /// 128-bit unsigned [16 x i8] vectors, saving each result in the 2246 /// corresponding element of a 128-bit result vector of [16 x i8]. 2247 /// 2248 /// \headerfile <x86intrin.h> 2249 /// 2250 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction. 2251 /// 2252 /// \param __a 2253 /// A 128-bit unsigned [16 x i8] vector. 2254 /// \param __b 2255 /// A 128-bit unsigned [16 x i8] vector. 2256 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded 2257 /// averages of both parameters. 
2258 static __inline__ __m128i __DEFAULT_FN_ATTRS 2259 _mm_avg_epu8(__m128i __a, __m128i __b) 2260 { 2261 typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); 2262 return (__m128i)__builtin_convertvector( 2263 ((__builtin_convertvector((__v16qu)__a, __v16hu) + 2264 __builtin_convertvector((__v16qu)__b, __v16hu)) + 1) 2265 >> 1, __v16qu); 2266 } 2267 2268 /// \brief Computes the rounded avarages of corresponding elements of two 2269 /// 128-bit unsigned [8 x i16] vectors, saving each result in the 2270 /// corresponding element of a 128-bit result vector of [8 x i16]. 2271 /// 2272 /// \headerfile <x86intrin.h> 2273 /// 2274 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction. 2275 /// 2276 /// \param __a 2277 /// A 128-bit unsigned [8 x i16] vector. 2278 /// \param __b 2279 /// A 128-bit unsigned [8 x i16] vector. 2280 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded 2281 /// averages of both parameters. 2282 static __inline__ __m128i __DEFAULT_FN_ATTRS 2283 _mm_avg_epu16(__m128i __a, __m128i __b) 2284 { 2285 typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); 2286 return (__m128i)__builtin_convertvector( 2287 ((__builtin_convertvector((__v8hu)__a, __v8su) + 2288 __builtin_convertvector((__v8hu)__b, __v8su)) + 1) 2289 >> 1, __v8hu); 2290 } 2291 2292 /// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16] 2293 /// vectors, producing eight intermediate 32-bit signed integer products, and 2294 /// adds the consecutive pairs of 32-bit products to form a 128-bit signed 2295 /// [4 x i32] vector. 2296 /// 2297 /// For example, bits [15:0] of both parameters are multiplied producing a 2298 /// 32-bit product, bits [31:16] of both parameters are multiplied producing 2299 /// a 32-bit product, and the sum of those two products becomes bits [31:0] 2300 /// of the result. 
2301 /// 2302 /// \headerfile <x86intrin.h> 2303 /// 2304 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction. 2305 /// 2306 /// \param __a 2307 /// A 128-bit signed [8 x i16] vector. 2308 /// \param __b 2309 /// A 128-bit signed [8 x i16] vector. 2310 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products 2311 /// of both parameters. 2312 static __inline__ __m128i __DEFAULT_FN_ATTRS 2313 _mm_madd_epi16(__m128i __a, __m128i __b) 2314 { 2315 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); 2316 } 2317 2318 /// \brief Compares corresponding elements of two 128-bit signed [8 x i16] 2319 /// vectors, saving the greater value from each comparison in the 2320 /// corresponding element of a 128-bit result vector of [8 x i16]. 2321 /// 2322 /// \headerfile <x86intrin.h> 2323 /// 2324 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction. 2325 /// 2326 /// \param __a 2327 /// A 128-bit signed [8 x i16] vector. 2328 /// \param __b 2329 /// A 128-bit signed [8 x i16] vector. 2330 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of 2331 /// each comparison. 2332 static __inline__ __m128i __DEFAULT_FN_ATTRS 2333 _mm_max_epi16(__m128i __a, __m128i __b) 2334 { 2335 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b); 2336 } 2337 2338 /// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8] 2339 /// vectors, saving the greater value from each comparison in the 2340 /// corresponding element of a 128-bit result vector of [16 x i8]. 2341 /// 2342 /// \headerfile <x86intrin.h> 2343 /// 2344 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction. 2345 /// 2346 /// \param __a 2347 /// A 128-bit unsigned [16 x i8] vector. 2348 /// \param __b 2349 /// A 128-bit unsigned [16 x i8] vector. 2350 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of 2351 /// each comparison. 
2352 static __inline__ __m128i __DEFAULT_FN_ATTRS 2353 _mm_max_epu8(__m128i __a, __m128i __b) 2354 { 2355 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b); 2356 } 2357 2358 /// \brief Compares corresponding elements of two 128-bit signed [8 x i16] 2359 /// vectors, saving the smaller value from each comparison in the 2360 /// corresponding element of a 128-bit result vector of [8 x i16]. 2361 /// 2362 /// \headerfile <x86intrin.h> 2363 /// 2364 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction. 2365 /// 2366 /// \param __a 2367 /// A 128-bit signed [8 x i16] vector. 2368 /// \param __b 2369 /// A 128-bit signed [8 x i16] vector. 2370 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of 2371 /// each comparison. 2372 static __inline__ __m128i __DEFAULT_FN_ATTRS 2373 _mm_min_epi16(__m128i __a, __m128i __b) 2374 { 2375 return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b); 2376 } 2377 2378 /// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8] 2379 /// vectors, saving the smaller value from each comparison in the 2380 /// corresponding element of a 128-bit result vector of [16 x i8]. 2381 /// 2382 /// \headerfile <x86intrin.h> 2383 /// 2384 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction. 2385 /// 2386 /// \param __a 2387 /// A 128-bit unsigned [16 x i8] vector. 2388 /// \param __b 2389 /// A 128-bit unsigned [16 x i8] vector. 2390 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of 2391 /// each comparison. 
2392 static __inline__ __m128i __DEFAULT_FN_ATTRS 2393 _mm_min_epu8(__m128i __a, __m128i __b) 2394 { 2395 return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b); 2396 } 2397 2398 /// \brief Multiplies the corresponding elements of two signed [8 x i16] 2399 /// vectors, saving the upper 16 bits of each 32-bit product in the 2400 /// corresponding element of a 128-bit signed [8 x i16] result vector. 2401 /// 2402 /// \headerfile <x86intrin.h> 2403 /// 2404 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction. 2405 /// 2406 /// \param __a 2407 /// A 128-bit signed [8 x i16] vector. 2408 /// \param __b 2409 /// A 128-bit signed [8 x i16] vector. 2410 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of 2411 /// each of the eight 32-bit products. 2412 static __inline__ __m128i __DEFAULT_FN_ATTRS 2413 _mm_mulhi_epi16(__m128i __a, __m128i __b) 2414 { 2415 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); 2416 } 2417 2418 /// \brief Multiplies the corresponding elements of two unsigned [8 x i16] 2419 /// vectors, saving the upper 16 bits of each 32-bit product in the 2420 /// corresponding element of a 128-bit unsigned [8 x i16] result vector. 2421 /// 2422 /// \headerfile <x86intrin.h> 2423 /// 2424 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction. 2425 /// 2426 /// \param __a 2427 /// A 128-bit unsigned [8 x i16] vector. 2428 /// \param __b 2429 /// A 128-bit unsigned [8 x i16] vector. 2430 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits 2431 /// of each of the eight 32-bit products. 
2432 static __inline__ __m128i __DEFAULT_FN_ATTRS 2433 _mm_mulhi_epu16(__m128i __a, __m128i __b) 2434 { 2435 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); 2436 } 2437 2438 /// \brief Multiplies the corresponding elements of two signed [8 x i16] 2439 /// vectors, saving the lower 16 bits of each 32-bit product in the 2440 /// corresponding element of a 128-bit signed [8 x i16] result vector. 2441 /// 2442 /// \headerfile <x86intrin.h> 2443 /// 2444 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction. 2445 /// 2446 /// \param __a 2447 /// A 128-bit signed [8 x i16] vector. 2448 /// \param __b 2449 /// A 128-bit signed [8 x i16] vector. 2450 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of 2451 /// each of the eight 32-bit products. 2452 static __inline__ __m128i __DEFAULT_FN_ATTRS 2453 _mm_mullo_epi16(__m128i __a, __m128i __b) 2454 { 2455 return (__m128i)((__v8hu)__a * (__v8hu)__b); 2456 } 2457 2458 /// \brief Multiplies 32-bit unsigned integer values contained in the lower bits 2459 /// of the two 64-bit integer vectors and returns the 64-bit unsigned 2460 /// product. 2461 /// 2462 /// \headerfile <x86intrin.h> 2463 /// 2464 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction. 2465 /// 2466 /// \param __a 2467 /// A 64-bit integer containing one of the source operands. 2468 /// \param __b 2469 /// A 64-bit integer containing one of the source operands. 2470 /// \returns A 64-bit integer vector containing the product of both operands. 2471 static __inline__ __m64 __DEFAULT_FN_ATTRS 2472 _mm_mul_su32(__m64 __a, __m64 __b) 2473 { 2474 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); 2475 } 2476 2477 /// \brief Multiplies 32-bit unsigned integer values contained in the lower 2478 /// bits of the corresponding elements of two [2 x i64] vectors, and returns 2479 /// the 64-bit products in the corresponding elements of a [2 x i64] vector. 
2480 /// 2481 /// \headerfile <x86intrin.h> 2482 /// 2483 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction. 2484 /// 2485 /// \param __a 2486 /// A [2 x i64] vector containing one of the source operands. 2487 /// \param __b 2488 /// A [2 x i64] vector containing one of the source operands. 2489 /// \returns A [2 x i64] vector containing the product of both operands. 2490 static __inline__ __m128i __DEFAULT_FN_ATTRS 2491 _mm_mul_epu32(__m128i __a, __m128i __b) 2492 { 2493 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); 2494 } 2495 2496 /// \brief Computes the absolute differences of corresponding 8-bit integer 2497 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and 2498 /// separately sums the second 8 absolute differences. Packs these two 2499 /// unsigned 16-bit integer sums into the upper and lower elements of a 2500 /// [2 x i64] vector. 2501 /// 2502 /// \headerfile <x86intrin.h> 2503 /// 2504 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction. 2505 /// 2506 /// \param __a 2507 /// A 128-bit integer vector containing one of the source operands. 2508 /// \param __b 2509 /// A 128-bit integer vector containing one of the source operands. 2510 /// \returns A [2 x i64] vector containing the sums of the sets of absolute 2511 /// differences between both operands. 2512 static __inline__ __m128i __DEFAULT_FN_ATTRS 2513 _mm_sad_epu8(__m128i __a, __m128i __b) 2514 { 2515 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); 2516 } 2517 2518 /// \brief Subtracts the corresponding 8-bit integer values in the operands. 2519 /// 2520 /// \headerfile <x86intrin.h> 2521 /// 2522 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction. 2523 /// 2524 /// \param __a 2525 /// A 128-bit integer vector containing the minuends. 2526 /// \param __b 2527 /// A 128-bit integer vector containing the subtrahends. 
2528 /// \returns A 128-bit integer vector containing the differences of the values 2529 /// in the operands. 2530 static __inline__ __m128i __DEFAULT_FN_ATTRS 2531 _mm_sub_epi8(__m128i __a, __m128i __b) 2532 { 2533 return (__m128i)((__v16qu)__a - (__v16qu)__b); 2534 } 2535 2536 /// \brief Subtracts the corresponding 16-bit integer values in the operands. 2537 /// 2538 /// \headerfile <x86intrin.h> 2539 /// 2540 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction. 2541 /// 2542 /// \param __a 2543 /// A 128-bit integer vector containing the minuends. 2544 /// \param __b 2545 /// A 128-bit integer vector containing the subtrahends. 2546 /// \returns A 128-bit integer vector containing the differences of the values 2547 /// in the operands. 2548 static __inline__ __m128i __DEFAULT_FN_ATTRS 2549 _mm_sub_epi16(__m128i __a, __m128i __b) 2550 { 2551 return (__m128i)((__v8hu)__a - (__v8hu)__b); 2552 } 2553 2554 /// \brief Subtracts the corresponding 32-bit integer values in the operands. 2555 /// 2556 /// \headerfile <x86intrin.h> 2557 /// 2558 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction. 2559 /// 2560 /// \param __a 2561 /// A 128-bit integer vector containing the minuends. 2562 /// \param __b 2563 /// A 128-bit integer vector containing the subtrahends. 2564 /// \returns A 128-bit integer vector containing the differences of the values 2565 /// in the operands. 2566 static __inline__ __m128i __DEFAULT_FN_ATTRS 2567 _mm_sub_epi32(__m128i __a, __m128i __b) 2568 { 2569 return (__m128i)((__v4su)__a - (__v4su)__b); 2570 } 2571 2572 /// \brief Subtracts signed or unsigned 64-bit integer values and writes the 2573 /// difference to the corresponding bits in the destination. 2574 /// 2575 /// \headerfile <x86intrin.h> 2576 /// 2577 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction. 2578 /// 2579 /// \param __a 2580 /// A 64-bit integer vector containing the minuend. 
2581 /// \param __b 2582 /// A 64-bit integer vector containing the subtrahend. 2583 /// \returns A 64-bit integer vector containing the difference of the values in 2584 /// the operands. 2585 static __inline__ __m64 __DEFAULT_FN_ATTRS 2586 _mm_sub_si64(__m64 __a, __m64 __b) 2587 { 2588 return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); 2589 } 2590 2591 /// \brief Subtracts the corresponding elements of two [2 x i64] vectors. 2592 /// 2593 /// \headerfile <x86intrin.h> 2594 /// 2595 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction. 2596 /// 2597 /// \param __a 2598 /// A 128-bit integer vector containing the minuends. 2599 /// \param __b 2600 /// A 128-bit integer vector containing the subtrahends. 2601 /// \returns A 128-bit integer vector containing the differences of the values 2602 /// in the operands. 2603 static __inline__ __m128i __DEFAULT_FN_ATTRS 2604 _mm_sub_epi64(__m128i __a, __m128i __b) 2605 { 2606 return (__m128i)((__v2du)__a - (__v2du)__b); 2607 } 2608 2609 /// \brief Subtracts corresponding 8-bit signed integer values in the input and 2610 /// returns the differences in the corresponding bytes in the destination. 2611 /// Differences greater than 7Fh are saturated to 7Fh, and differences less 2612 /// than 80h are saturated to 80h. 2613 /// 2614 /// \headerfile <x86intrin.h> 2615 /// 2616 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction. 2617 /// 2618 /// \param __a 2619 /// A 128-bit integer vector containing the minuends. 2620 /// \param __b 2621 /// A 128-bit integer vector containing the subtrahends. 2622 /// \returns A 128-bit integer vector containing the differences of the values 2623 /// in the operands. 
2624 static __inline__ __m128i __DEFAULT_FN_ATTRS 2625 _mm_subs_epi8(__m128i __a, __m128i __b) 2626 { 2627 return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b); 2628 } 2629 2630 /// \brief Subtracts corresponding 16-bit signed integer values in the input and 2631 /// returns the differences in the corresponding bytes in the destination. 2632 /// Differences greater than 7FFFh are saturated to 7FFFh, and values less 2633 /// than 8000h are saturated to 8000h. 2634 /// 2635 /// \headerfile <x86intrin.h> 2636 /// 2637 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction. 2638 /// 2639 /// \param __a 2640 /// A 128-bit integer vector containing the minuends. 2641 /// \param __b 2642 /// A 128-bit integer vector containing the subtrahends. 2643 /// \returns A 128-bit integer vector containing the differences of the values 2644 /// in the operands. 2645 static __inline__ __m128i __DEFAULT_FN_ATTRS 2646 _mm_subs_epi16(__m128i __a, __m128i __b) 2647 { 2648 return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b); 2649 } 2650 2651 /// \brief Subtracts corresponding 8-bit unsigned integer values in the input 2652 /// and returns the differences in the corresponding bytes in the 2653 /// destination. Differences less than 00h are saturated to 00h. 2654 /// 2655 /// \headerfile <x86intrin.h> 2656 /// 2657 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction. 2658 /// 2659 /// \param __a 2660 /// A 128-bit integer vector containing the minuends. 2661 /// \param __b 2662 /// A 128-bit integer vector containing the subtrahends. 2663 /// \returns A 128-bit integer vector containing the unsigned integer 2664 /// differences of the values in the operands. 
2665 static __inline__ __m128i __DEFAULT_FN_ATTRS 2666 _mm_subs_epu8(__m128i __a, __m128i __b) 2667 { 2668 return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b); 2669 } 2670 2671 /// \brief Subtracts corresponding 16-bit unsigned integer values in the input 2672 /// and returns the differences in the corresponding bytes in the 2673 /// destination. Differences less than 0000h are saturated to 0000h. 2674 /// 2675 /// \headerfile <x86intrin.h> 2676 /// 2677 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction. 2678 /// 2679 /// \param __a 2680 /// A 128-bit integer vector containing the minuends. 2681 /// \param __b 2682 /// A 128-bit integer vector containing the subtrahends. 2683 /// \returns A 128-bit integer vector containing the unsigned integer 2684 /// differences of the values in the operands. 2685 static __inline__ __m128i __DEFAULT_FN_ATTRS 2686 _mm_subs_epu16(__m128i __a, __m128i __b) 2687 { 2688 return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b); 2689 } 2690 2691 /// \brief Performs a bitwise AND of two 128-bit integer vectors. 2692 /// 2693 /// \headerfile <x86intrin.h> 2694 /// 2695 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. 2696 /// 2697 /// \param __a 2698 /// A 128-bit integer vector containing one of the source operands. 2699 /// \param __b 2700 /// A 128-bit integer vector containing one of the source operands. 2701 /// \returns A 128-bit integer vector containing the bitwise AND of the values 2702 /// in both operands. 2703 static __inline__ __m128i __DEFAULT_FN_ATTRS 2704 _mm_and_si128(__m128i __a, __m128i __b) 2705 { 2706 return (__m128i)((__v2du)__a & (__v2du)__b); 2707 } 2708 2709 /// \brief Performs a bitwise AND of two 128-bit integer vectors, using the 2710 /// one's complement of the values contained in the first source operand. 
2711 /// 2712 /// \headerfile <x86intrin.h> 2713 /// 2714 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. 2715 /// 2716 /// \param __a 2717 /// A 128-bit vector containing the left source operand. The one's complement 2718 /// of this value is used in the bitwise AND. 2719 /// \param __b 2720 /// A 128-bit vector containing the right source operand. 2721 /// \returns A 128-bit integer vector containing the bitwise AND of the one's 2722 /// complement of the first operand and the values in the second operand. 2723 static __inline__ __m128i __DEFAULT_FN_ATTRS 2724 _mm_andnot_si128(__m128i __a, __m128i __b) 2725 { 2726 return (__m128i)(~(__v2du)__a & (__v2du)__b); 2727 } 2728 /// \brief Performs a bitwise OR of two 128-bit integer vectors. 2729 /// 2730 /// \headerfile <x86intrin.h> 2731 /// 2732 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. 2733 /// 2734 /// \param __a 2735 /// A 128-bit integer vector containing one of the source operands. 2736 /// \param __b 2737 /// A 128-bit integer vector containing one of the source operands. 2738 /// \returns A 128-bit integer vector containing the bitwise OR of the values 2739 /// in both operands. 2740 static __inline__ __m128i __DEFAULT_FN_ATTRS 2741 _mm_or_si128(__m128i __a, __m128i __b) 2742 { 2743 return (__m128i)((__v2du)__a | (__v2du)__b); 2744 } 2745 2746 /// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors. 2747 /// 2748 /// \headerfile <x86intrin.h> 2749 /// 2750 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. 2751 /// 2752 /// \param __a 2753 /// A 128-bit integer vector containing one of the source operands. 2754 /// \param __b 2755 /// A 128-bit integer vector containing one of the source operands. 2756 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the 2757 /// values in both operands. 
2758 static __inline__ __m128i __DEFAULT_FN_ATTRS 2759 _mm_xor_si128(__m128i __a, __m128i __b) 2760 { 2761 return (__m128i)((__v2du)__a ^ (__v2du)__b); 2762 } 2763 2764 /// \brief Left-shifts the 128-bit integer vector operand by the specified 2765 /// number of bytes. Low-order bits are cleared. 2766 /// 2767 /// \headerfile <x86intrin.h> 2768 /// 2769 /// \code 2770 /// __m128i _mm_slli_si128(__m128i a, const int imm); 2771 /// \endcode 2772 /// 2773 /// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction. 2774 /// 2775 /// \param a 2776 /// A 128-bit integer vector containing the source operand. 2777 /// \param imm 2778 /// An immediate value specifying the number of bytes to left-shift operand 2779 /// \a a. 2780 /// \returns A 128-bit integer vector containing the left-shifted value. 2781 #define _mm_slli_si128(a, imm) __extension__ ({ \ 2782 (__m128i)__builtin_shufflevector( \ 2783 (__v16qi)_mm_setzero_si128(), \ 2784 (__v16qi)(__m128i)(a), \ 2785 ((char)(imm)&0xF0) ? 0 : 16 - (char)(imm), \ 2786 ((char)(imm)&0xF0) ? 1 : 17 - (char)(imm), \ 2787 ((char)(imm)&0xF0) ? 2 : 18 - (char)(imm), \ 2788 ((char)(imm)&0xF0) ? 3 : 19 - (char)(imm), \ 2789 ((char)(imm)&0xF0) ? 4 : 20 - (char)(imm), \ 2790 ((char)(imm)&0xF0) ? 5 : 21 - (char)(imm), \ 2791 ((char)(imm)&0xF0) ? 6 : 22 - (char)(imm), \ 2792 ((char)(imm)&0xF0) ? 7 : 23 - (char)(imm), \ 2793 ((char)(imm)&0xF0) ? 8 : 24 - (char)(imm), \ 2794 ((char)(imm)&0xF0) ? 9 : 25 - (char)(imm), \ 2795 ((char)(imm)&0xF0) ? 10 : 26 - (char)(imm), \ 2796 ((char)(imm)&0xF0) ? 11 : 27 - (char)(imm), \ 2797 ((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \ 2798 ((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \ 2799 ((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \ 2800 ((char)(imm)&0xF0) ? 15 : 31 - (char)(imm)); }) 2801 2802 #define _mm_bslli_si128(a, imm) \ 2803 _mm_slli_si128((a), (imm)) 2804 2805 /// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand 2806 /// by the specified number of bits. 
Low-order bits are cleared. 2807 /// 2808 /// \headerfile <x86intrin.h> 2809 /// 2810 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. 2811 /// 2812 /// \param __a 2813 /// A 128-bit integer vector containing the source operand. 2814 /// \param __count 2815 /// An integer value specifying the number of bits to left-shift each value 2816 /// in operand \a __a. 2817 /// \returns A 128-bit integer vector containing the left-shifted values. 2818 static __inline__ __m128i __DEFAULT_FN_ATTRS 2819 _mm_slli_epi16(__m128i __a, int __count) 2820 { 2821 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); 2822 } 2823 2824 /// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand 2825 /// by the specified number of bits. Low-order bits are cleared. 2826 /// 2827 /// \headerfile <x86intrin.h> 2828 /// 2829 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. 2830 /// 2831 /// \param __a 2832 /// A 128-bit integer vector containing the source operand. 2833 /// \param __count 2834 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2835 /// to left-shift each value in operand \a __a. 2836 /// \returns A 128-bit integer vector containing the left-shifted values. 2837 static __inline__ __m128i __DEFAULT_FN_ATTRS 2838 _mm_sll_epi16(__m128i __a, __m128i __count) 2839 { 2840 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); 2841 } 2842 2843 /// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand 2844 /// by the specified number of bits. Low-order bits are cleared. 2845 /// 2846 /// \headerfile <x86intrin.h> 2847 /// 2848 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. 2849 /// 2850 /// \param __a 2851 /// A 128-bit integer vector containing the source operand. 2852 /// \param __count 2853 /// An integer value specifying the number of bits to left-shift each value 2854 /// in operand \a __a. 
2855 /// \returns A 128-bit integer vector containing the left-shifted values. 2856 static __inline__ __m128i __DEFAULT_FN_ATTRS 2857 _mm_slli_epi32(__m128i __a, int __count) 2858 { 2859 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); 2860 } 2861 2862 /// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand 2863 /// by the specified number of bits. Low-order bits are cleared. 2864 /// 2865 /// \headerfile <x86intrin.h> 2866 /// 2867 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. 2868 /// 2869 /// \param __a 2870 /// A 128-bit integer vector containing the source operand. 2871 /// \param __count 2872 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2873 /// to left-shift each value in operand \a __a. 2874 /// \returns A 128-bit integer vector containing the left-shifted values. 2875 static __inline__ __m128i __DEFAULT_FN_ATTRS 2876 _mm_sll_epi32(__m128i __a, __m128i __count) 2877 { 2878 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); 2879 } 2880 2881 /// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand 2882 /// by the specified number of bits. Low-order bits are cleared. 2883 /// 2884 /// \headerfile <x86intrin.h> 2885 /// 2886 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. 2887 /// 2888 /// \param __a 2889 /// A 128-bit integer vector containing the source operand. 2890 /// \param __count 2891 /// An integer value specifying the number of bits to left-shift each value 2892 /// in operand \a __a. 2893 /// \returns A 128-bit integer vector containing the left-shifted values. 2894 static __inline__ __m128i __DEFAULT_FN_ATTRS 2895 _mm_slli_epi64(__m128i __a, int __count) 2896 { 2897 return __builtin_ia32_psllqi128((__v2di)__a, __count); 2898 } 2899 2900 /// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand 2901 /// by the specified number of bits. Low-order bits are cleared. 
2902 /// 2903 /// \headerfile <x86intrin.h> 2904 /// 2905 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. 2906 /// 2907 /// \param __a 2908 /// A 128-bit integer vector containing the source operand. 2909 /// \param __count 2910 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2911 /// to left-shift each value in operand \a __a. 2912 /// \returns A 128-bit integer vector containing the left-shifted values. 2913 static __inline__ __m128i __DEFAULT_FN_ATTRS 2914 _mm_sll_epi64(__m128i __a, __m128i __count) 2915 { 2916 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count); 2917 } 2918 2919 /// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand 2920 /// by the specified number of bits. High-order bits are filled with the sign 2921 /// bit of the initial value. 2922 /// 2923 /// \headerfile <x86intrin.h> 2924 /// 2925 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. 2926 /// 2927 /// \param __a 2928 /// A 128-bit integer vector containing the source operand. 2929 /// \param __count 2930 /// An integer value specifying the number of bits to right-shift each value 2931 /// in operand \a __a. 2932 /// \returns A 128-bit integer vector containing the right-shifted values. 2933 static __inline__ __m128i __DEFAULT_FN_ATTRS 2934 _mm_srai_epi16(__m128i __a, int __count) 2935 { 2936 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); 2937 } 2938 2939 /// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand 2940 /// by the specified number of bits. High-order bits are filled with the sign 2941 /// bit of the initial value. 2942 /// 2943 /// \headerfile <x86intrin.h> 2944 /// 2945 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. 2946 /// 2947 /// \param __a 2948 /// A 128-bit integer vector containing the source operand. 
2949 /// \param __count 2950 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2951 /// to right-shift each value in operand \a __a. 2952 /// \returns A 128-bit integer vector containing the right-shifted values. 2953 static __inline__ __m128i __DEFAULT_FN_ATTRS 2954 _mm_sra_epi16(__m128i __a, __m128i __count) 2955 { 2956 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); 2957 } 2958 2959 /// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand 2960 /// by the specified number of bits. High-order bits are filled with the sign 2961 /// bit of the initial value. 2962 /// 2963 /// \headerfile <x86intrin.h> 2964 /// 2965 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. 2966 /// 2967 /// \param __a 2968 /// A 128-bit integer vector containing the source operand. 2969 /// \param __count 2970 /// An integer value specifying the number of bits to right-shift each value 2971 /// in operand \a __a. 2972 /// \returns A 128-bit integer vector containing the right-shifted values. 2973 static __inline__ __m128i __DEFAULT_FN_ATTRS 2974 _mm_srai_epi32(__m128i __a, int __count) 2975 { 2976 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); 2977 } 2978 2979 /// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand 2980 /// by the specified number of bits. High-order bits are filled with the sign 2981 /// bit of the initial value. 2982 /// 2983 /// \headerfile <x86intrin.h> 2984 /// 2985 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. 2986 /// 2987 /// \param __a 2988 /// A 128-bit integer vector containing the source operand. 2989 /// \param __count 2990 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2991 /// to right-shift each value in operand \a __a. 2992 /// \returns A 128-bit integer vector containing the right-shifted values. 
2993 static __inline__ __m128i __DEFAULT_FN_ATTRS 2994 _mm_sra_epi32(__m128i __a, __m128i __count) 2995 { 2996 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); 2997 } 2998 2999 /// \brief Right-shifts the 128-bit integer vector operand by the specified 3000 /// number of bytes. High-order bits are cleared. 3001 /// 3002 /// \headerfile <x86intrin.h> 3003 /// 3004 /// \code 3005 /// __m128i _mm_srli_si128(__m128i a, const int imm); 3006 /// \endcode 3007 /// 3008 /// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction. 3009 /// 3010 /// \param a 3011 /// A 128-bit integer vector containing the source operand. 3012 /// \param imm 3013 /// An immediate value specifying the number of bytes to right-shift operand 3014 /// \a a. 3015 /// \returns A 128-bit integer vector containing the right-shifted value. 3016 #define _mm_srli_si128(a, imm) __extension__ ({ \ 3017 (__m128i)__builtin_shufflevector( \ 3018 (__v16qi)(__m128i)(a), \ 3019 (__v16qi)_mm_setzero_si128(), \ 3020 ((char)(imm)&0xF0) ? 16 : (char)(imm) + 0, \ 3021 ((char)(imm)&0xF0) ? 17 : (char)(imm) + 1, \ 3022 ((char)(imm)&0xF0) ? 18 : (char)(imm) + 2, \ 3023 ((char)(imm)&0xF0) ? 19 : (char)(imm) + 3, \ 3024 ((char)(imm)&0xF0) ? 20 : (char)(imm) + 4, \ 3025 ((char)(imm)&0xF0) ? 21 : (char)(imm) + 5, \ 3026 ((char)(imm)&0xF0) ? 22 : (char)(imm) + 6, \ 3027 ((char)(imm)&0xF0) ? 23 : (char)(imm) + 7, \ 3028 ((char)(imm)&0xF0) ? 24 : (char)(imm) + 8, \ 3029 ((char)(imm)&0xF0) ? 25 : (char)(imm) + 9, \ 3030 ((char)(imm)&0xF0) ? 26 : (char)(imm) + 10, \ 3031 ((char)(imm)&0xF0) ? 27 : (char)(imm) + 11, \ 3032 ((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \ 3033 ((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \ 3034 ((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \ 3035 ((char)(imm)&0xF0) ? 
31 : (char)(imm) + 15); }) 3036 3037 #define _mm_bsrli_si128(a, imm) \ 3038 _mm_srli_si128((a), (imm)) 3039 3040 /// \brief Right-shifts each of 16-bit values in the 128-bit integer vector 3041 /// operand by the specified number of bits. High-order bits are cleared. 3042 /// 3043 /// \headerfile <x86intrin.h> 3044 /// 3045 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. 3046 /// 3047 /// \param __a 3048 /// A 128-bit integer vector containing the source operand. 3049 /// \param __count 3050 /// An integer value specifying the number of bits to right-shift each value 3051 /// in operand \a __a. 3052 /// \returns A 128-bit integer vector containing the right-shifted values. 3053 static __inline__ __m128i __DEFAULT_FN_ATTRS 3054 _mm_srli_epi16(__m128i __a, int __count) 3055 { 3056 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); 3057 } 3058 3059 /// \brief Right-shifts each of 16-bit values in the 128-bit integer vector 3060 /// operand by the specified number of bits. High-order bits are cleared. 3061 /// 3062 /// \headerfile <x86intrin.h> 3063 /// 3064 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. 3065 /// 3066 /// \param __a 3067 /// A 128-bit integer vector containing the source operand. 3068 /// \param __count 3069 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 3070 /// to right-shift each value in operand \a __a. 3071 /// \returns A 128-bit integer vector containing the right-shifted values. 3072 static __inline__ __m128i __DEFAULT_FN_ATTRS 3073 _mm_srl_epi16(__m128i __a, __m128i __count) 3074 { 3075 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); 3076 } 3077 3078 /// \brief Right-shifts each of 32-bit values in the 128-bit integer vector 3079 /// operand by the specified number of bits. High-order bits are cleared. 
3080 /// 3081 /// \headerfile <x86intrin.h> 3082 /// 3083 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. 3084 /// 3085 /// \param __a 3086 /// A 128-bit integer vector containing the source operand. 3087 /// \param __count 3088 /// An integer value specifying the number of bits to right-shift each value 3089 /// in operand \a __a. 3090 /// \returns A 128-bit integer vector containing the right-shifted values. 3091 static __inline__ __m128i __DEFAULT_FN_ATTRS 3092 _mm_srli_epi32(__m128i __a, int __count) 3093 { 3094 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); 3095 } 3096 3097 /// \brief Right-shifts each of 32-bit values in the 128-bit integer vector 3098 /// operand by the specified number of bits. High-order bits are cleared. 3099 /// 3100 /// \headerfile <x86intrin.h> 3101 /// 3102 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. 3103 /// 3104 /// \param __a 3105 /// A 128-bit integer vector containing the source operand. 3106 /// \param __count 3107 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 3108 /// to right-shift each value in operand \a __a. 3109 /// \returns A 128-bit integer vector containing the right-shifted values. 3110 static __inline__ __m128i __DEFAULT_FN_ATTRS 3111 _mm_srl_epi32(__m128i __a, __m128i __count) 3112 { 3113 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); 3114 } 3115 3116 /// \brief Right-shifts each of 64-bit values in the 128-bit integer vector 3117 /// operand by the specified number of bits. High-order bits are cleared. 3118 /// 3119 /// \headerfile <x86intrin.h> 3120 /// 3121 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. 3122 /// 3123 /// \param __a 3124 /// A 128-bit integer vector containing the source operand. 3125 /// \param __count 3126 /// An integer value specifying the number of bits to right-shift each value 3127 /// in operand \a __a. 
3128 /// \returns A 128-bit integer vector containing the right-shifted values. 3129 static __inline__ __m128i __DEFAULT_FN_ATTRS 3130 _mm_srli_epi64(__m128i __a, int __count) 3131 { 3132 return __builtin_ia32_psrlqi128((__v2di)__a, __count); 3133 } 3134 3135 /// \brief Right-shifts each of 64-bit values in the 128-bit integer vector 3136 /// operand by the specified number of bits. High-order bits are cleared. 3137 /// 3138 /// \headerfile <x86intrin.h> 3139 /// 3140 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. 3141 /// 3142 /// \param __a 3143 /// A 128-bit integer vector containing the source operand. 3144 /// \param __count 3145 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 3146 /// to right-shift each value in operand \a __a. 3147 /// \returns A 128-bit integer vector containing the right-shifted values. 3148 static __inline__ __m128i __DEFAULT_FN_ATTRS 3149 _mm_srl_epi64(__m128i __a, __m128i __count) 3150 { 3151 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count); 3152 } 3153 3154 /// \brief Compares each of the corresponding 8-bit values of the 128-bit 3155 /// integer vectors for equality. Each comparison yields 0h for false, FFh 3156 /// for true. 3157 /// 3158 /// \headerfile <x86intrin.h> 3159 /// 3160 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction. 3161 /// 3162 /// \param __a 3163 /// A 128-bit integer vector. 3164 /// \param __b 3165 /// A 128-bit integer vector. 3166 /// \returns A 128-bit integer vector containing the comparison results. 3167 static __inline__ __m128i __DEFAULT_FN_ATTRS 3168 _mm_cmpeq_epi8(__m128i __a, __m128i __b) 3169 { 3170 return (__m128i)((__v16qi)__a == (__v16qi)__b); 3171 } 3172 3173 /// \brief Compares each of the corresponding 16-bit values of the 128-bit 3174 /// integer vectors for equality. Each comparison yields 0h for false, FFFFh 3175 /// for true. 
3176 /// 3177 /// \headerfile <x86intrin.h> 3178 /// 3179 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction. 3180 /// 3181 /// \param __a 3182 /// A 128-bit integer vector. 3183 /// \param __b 3184 /// A 128-bit integer vector. 3185 /// \returns A 128-bit integer vector containing the comparison results. 3186 static __inline__ __m128i __DEFAULT_FN_ATTRS 3187 _mm_cmpeq_epi16(__m128i __a, __m128i __b) 3188 { 3189 return (__m128i)((__v8hi)__a == (__v8hi)__b); 3190 } 3191 3192 /// \brief Compares each of the corresponding 32-bit values of the 128-bit 3193 /// integer vectors for equality. Each comparison yields 0h for false, 3194 /// FFFFFFFFh for true. 3195 /// 3196 /// \headerfile <x86intrin.h> 3197 /// 3198 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction. 3199 /// 3200 /// \param __a 3201 /// A 128-bit integer vector. 3202 /// \param __b 3203 /// A 128-bit integer vector. 3204 /// \returns A 128-bit integer vector containing the comparison results. 3205 static __inline__ __m128i __DEFAULT_FN_ATTRS 3206 _mm_cmpeq_epi32(__m128i __a, __m128i __b) 3207 { 3208 return (__m128i)((__v4si)__a == (__v4si)__b); 3209 } 3210 3211 /// \brief Compares each of the corresponding signed 8-bit values of the 128-bit 3212 /// integer vectors to determine if the values in the first operand are 3213 /// greater than those in the second operand. Each comparison yields 0h for 3214 /// false, FFh for true. 3215 /// 3216 /// \headerfile <x86intrin.h> 3217 /// 3218 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. 3219 /// 3220 /// \param __a 3221 /// A 128-bit integer vector. 3222 /// \param __b 3223 /// A 128-bit integer vector. 3224 /// \returns A 128-bit integer vector containing the comparison results. 
3225 static __inline__ __m128i __DEFAULT_FN_ATTRS 3226 _mm_cmpgt_epi8(__m128i __a, __m128i __b) 3227 { 3228 /* This function always performs a signed comparison, but __v16qi is a char 3229 which may be signed or unsigned, so use __v16qs. */ 3230 return (__m128i)((__v16qs)__a > (__v16qs)__b); 3231 } 3232 3233 /// \brief Compares each of the corresponding signed 16-bit values of the 3234 /// 128-bit integer vectors to determine if the values in the first operand 3235 /// are greater than those in the second operand. 3236 /// 3237 /// Each comparison yields 0h for false, FFFFh for true. 3238 /// 3239 /// \headerfile <x86intrin.h> 3240 /// 3241 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. 3242 /// 3243 /// \param __a 3244 /// A 128-bit integer vector. 3245 /// \param __b 3246 /// A 128-bit integer vector. 3247 /// \returns A 128-bit integer vector containing the comparison results. 3248 static __inline__ __m128i __DEFAULT_FN_ATTRS 3249 _mm_cmpgt_epi16(__m128i __a, __m128i __b) 3250 { 3251 return (__m128i)((__v8hi)__a > (__v8hi)__b); 3252 } 3253 3254 /// \brief Compares each of the corresponding signed 32-bit values of the 3255 /// 128-bit integer vectors to determine if the values in the first operand 3256 /// are greater than those in the second operand. 3257 /// 3258 /// Each comparison yields 0h for false, FFFFFFFFh for true. 3259 /// 3260 /// \headerfile <x86intrin.h> 3261 /// 3262 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. 3263 /// 3264 /// \param __a 3265 /// A 128-bit integer vector. 3266 /// \param __b 3267 /// A 128-bit integer vector. 3268 /// \returns A 128-bit integer vector containing the comparison results. 
3269 static __inline__ __m128i __DEFAULT_FN_ATTRS 3270 _mm_cmpgt_epi32(__m128i __a, __m128i __b) 3271 { 3272 return (__m128i)((__v4si)__a > (__v4si)__b); 3273 } 3274 3275 /// \brief Compares each of the corresponding signed 8-bit values of the 128-bit 3276 /// integer vectors to determine if the values in the first operand are less 3277 /// than those in the second operand. 3278 /// 3279 /// Each comparison yields 0h for false, FFh for true. 3280 /// 3281 /// \headerfile <x86intrin.h> 3282 /// 3283 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. 3284 /// 3285 /// \param __a 3286 /// A 128-bit integer vector. 3287 /// \param __b 3288 /// A 128-bit integer vector. 3289 /// \returns A 128-bit integer vector containing the comparison results. 3290 static __inline__ __m128i __DEFAULT_FN_ATTRS 3291 _mm_cmplt_epi8(__m128i __a, __m128i __b) 3292 { 3293 return _mm_cmpgt_epi8(__b, __a); 3294 } 3295 3296 /// \brief Compares each of the corresponding signed 16-bit values of the 3297 /// 128-bit integer vectors to determine if the values in the first operand 3298 /// are less than those in the second operand. 3299 /// 3300 /// Each comparison yields 0h for false, FFFFh for true. 3301 /// 3302 /// \headerfile <x86intrin.h> 3303 /// 3304 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. 3305 /// 3306 /// \param __a 3307 /// A 128-bit integer vector. 3308 /// \param __b 3309 /// A 128-bit integer vector. 3310 /// \returns A 128-bit integer vector containing the comparison results. 3311 static __inline__ __m128i __DEFAULT_FN_ATTRS 3312 _mm_cmplt_epi16(__m128i __a, __m128i __b) 3313 { 3314 return _mm_cmpgt_epi16(__b, __a); 3315 } 3316 3317 /// \brief Compares each of the corresponding signed 32-bit values of the 3318 /// 128-bit integer vectors to determine if the values in the first operand 3319 /// are less than those in the second operand. 3320 /// 3321 /// Each comparison yields 0h for false, FFFFFFFFh for true. 
3322 /// 3323 /// \headerfile <x86intrin.h> 3324 /// 3325 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. 3326 /// 3327 /// \param __a 3328 /// A 128-bit integer vector. 3329 /// \param __b 3330 /// A 128-bit integer vector. 3331 /// \returns A 128-bit integer vector containing the comparison results. 3332 static __inline__ __m128i __DEFAULT_FN_ATTRS 3333 _mm_cmplt_epi32(__m128i __a, __m128i __b) 3334 { 3335 return _mm_cmpgt_epi32(__b, __a); 3336 } 3337 3338 #ifdef __x86_64__ 3339 /// \brief Converts a 64-bit signed integer value from the second operand into a 3340 /// double-precision value and returns it in the lower element of a [2 x 3341 /// double] vector; the upper element of the returned vector is copied from 3342 /// the upper element of the first operand. 3343 /// 3344 /// \headerfile <x86intrin.h> 3345 /// 3346 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. 3347 /// 3348 /// \param __a 3349 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are 3350 /// copied to the upper 64 bits of the destination. 3351 /// \param __b 3352 /// A 64-bit signed integer operand containing the value to be converted. 3353 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 3354 /// converted value of the second operand. The upper 64 bits are copied from 3355 /// the upper 64 bits of the first operand. 3356 static __inline__ __m128d __DEFAULT_FN_ATTRS 3357 _mm_cvtsi64_sd(__m128d __a, long long __b) 3358 { 3359 __a[0] = __b; 3360 return __a; 3361 } 3362 3363 /// \brief Converts the first (lower) element of a vector of [2 x double] into a 3364 /// 64-bit signed integer value, according to the current rounding mode. 3365 /// 3366 /// \headerfile <x86intrin.h> 3367 /// 3368 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. 3369 /// 3370 /// \param __a 3371 /// A 128-bit vector of [2 x double]. 
The lower 64 bits are used in the 3372 /// conversion. 3373 /// \returns A 64-bit signed integer containing the converted value. 3374 static __inline__ long long __DEFAULT_FN_ATTRS 3375 _mm_cvtsd_si64(__m128d __a) 3376 { 3377 return __builtin_ia32_cvtsd2si64((__v2df)__a); 3378 } 3379 3380 /// \brief Converts the first (lower) element of a vector of [2 x double] into a 3381 /// 64-bit signed integer value, truncating the result when it is inexact. 3382 /// 3383 /// \headerfile <x86intrin.h> 3384 /// 3385 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> 3386 /// instruction. 3387 /// 3388 /// \param __a 3389 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 3390 /// conversion. 3391 /// \returns A 64-bit signed integer containing the converted value. 3392 static __inline__ long long __DEFAULT_FN_ATTRS 3393 _mm_cvttsd_si64(__m128d __a) 3394 { 3395 return __builtin_ia32_cvttsd2si64((__v2df)__a); 3396 } 3397 #endif 3398 3399 /// \brief Converts a vector of [4 x i32] into a vector of [4 x float]. 3400 /// 3401 /// \headerfile <x86intrin.h> 3402 /// 3403 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction. 3404 /// 3405 /// \param __a 3406 /// A 128-bit integer vector. 3407 /// \returns A 128-bit vector of [4 x float] containing the converted values. 3408 static __inline__ __m128 __DEFAULT_FN_ATTRS 3409 _mm_cvtepi32_ps(__m128i __a) 3410 { 3411 return __builtin_ia32_cvtdq2ps((__v4si)__a); 3412 } 3413 3414 /// \brief Converts a vector of [4 x float] into a vector of [4 x i32]. 3415 /// 3416 /// \headerfile <x86intrin.h> 3417 /// 3418 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction. 3419 /// 3420 /// \param __a 3421 /// A 128-bit vector of [4 x float]. 3422 /// \returns A 128-bit integer vector of [4 x i32] containing the converted 3423 /// values. 
3424 static __inline__ __m128i __DEFAULT_FN_ATTRS 3425 _mm_cvtps_epi32(__m128 __a) 3426 { 3427 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a); 3428 } 3429 3430 /// \brief Converts a vector of [4 x float] into a vector of [4 x i32], 3431 /// truncating the result when it is inexact. 3432 /// 3433 /// \headerfile <x86intrin.h> 3434 /// 3435 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c> 3436 /// instruction. 3437 /// 3438 /// \param __a 3439 /// A 128-bit vector of [4 x float]. 3440 /// \returns A 128-bit vector of [4 x i32] containing the converted values. 3441 static __inline__ __m128i __DEFAULT_FN_ATTRS 3442 _mm_cvttps_epi32(__m128 __a) 3443 { 3444 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a); 3445 } 3446 3447 /// \brief Returns a vector of [4 x i32] where the lowest element is the input 3448 /// operand and the remaining elements are zero. 3449 /// 3450 /// \headerfile <x86intrin.h> 3451 /// 3452 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3453 /// 3454 /// \param __a 3455 /// A 32-bit signed integer operand. 3456 /// \returns A 128-bit vector of [4 x i32]. 3457 static __inline__ __m128i __DEFAULT_FN_ATTRS 3458 _mm_cvtsi32_si128(int __a) 3459 { 3460 return (__m128i)(__v4si){ __a, 0, 0, 0 }; 3461 } 3462 3463 #ifdef __x86_64__ 3464 /// \brief Returns a vector of [2 x i64] where the lower element is the input 3465 /// operand and the upper element is zero. 3466 /// 3467 /// \headerfile <x86intrin.h> 3468 /// 3469 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3470 /// 3471 /// \param __a 3472 /// A 64-bit signed integer operand containing the value to be converted. 3473 /// \returns A 128-bit vector of [2 x i64] containing the converted value. 
3474 static __inline__ __m128i __DEFAULT_FN_ATTRS 3475 _mm_cvtsi64_si128(long long __a) 3476 { 3477 return (__m128i){ __a, 0 }; 3478 } 3479 #endif 3480 3481 /// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a 3482 /// 32-bit signed integer value. 3483 /// 3484 /// \headerfile <x86intrin.h> 3485 /// 3486 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3487 /// 3488 /// \param __a 3489 /// A vector of [4 x i32]. The least significant 32 bits are moved to the 3490 /// destination. 3491 /// \returns A 32-bit signed integer containing the moved value. 3492 static __inline__ int __DEFAULT_FN_ATTRS 3493 _mm_cvtsi128_si32(__m128i __a) 3494 { 3495 __v4si __b = (__v4si)__a; 3496 return __b[0]; 3497 } 3498 3499 #ifdef __x86_64__ 3500 /// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a 3501 /// 64-bit signed integer value. 3502 /// 3503 /// \headerfile <x86intrin.h> 3504 /// 3505 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3506 /// 3507 /// \param __a 3508 /// A vector of [2 x i64]. The least significant 64 bits are moved to the 3509 /// destination. 3510 /// \returns A 64-bit signed integer containing the moved value. 3511 static __inline__ long long __DEFAULT_FN_ATTRS 3512 _mm_cvtsi128_si64(__m128i __a) 3513 { 3514 return __a[0]; 3515 } 3516 #endif 3517 3518 /// \brief Moves packed integer values from an aligned 128-bit memory location 3519 /// to elements in a 128-bit integer vector. 3520 /// 3521 /// \headerfile <x86intrin.h> 3522 /// 3523 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction. 3524 /// 3525 /// \param __p 3526 /// An aligned pointer to a memory location containing integer values. 3527 /// \returns A 128-bit integer vector containing the moved values. 
3528 static __inline__ __m128i __DEFAULT_FN_ATTRS 3529 _mm_load_si128(__m128i const *__p) 3530 { 3531 return *__p; 3532 } 3533 3534 /// \brief Moves packed integer values from an unaligned 128-bit memory location 3535 /// to elements in a 128-bit integer vector. 3536 /// 3537 /// \headerfile <x86intrin.h> 3538 /// 3539 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction. 3540 /// 3541 /// \param __p 3542 /// A pointer to a memory location containing integer values. 3543 /// \returns A 128-bit integer vector containing the moved values. 3544 static __inline__ __m128i __DEFAULT_FN_ATTRS 3545 _mm_loadu_si128(__m128i const *__p) 3546 { 3547 struct __loadu_si128 { 3548 __m128i __v; 3549 } __attribute__((__packed__, __may_alias__)); 3550 return ((struct __loadu_si128*)__p)->__v; 3551 } 3552 3553 /// \brief Returns a vector of [2 x i64] where the lower element is taken from 3554 /// the lower element of the operand, and the upper element is zero. 3555 /// 3556 /// \headerfile <x86intrin.h> 3557 /// 3558 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3559 /// 3560 /// \param __p 3561 /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of 3562 /// the destination. 3563 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the 3564 /// moved value. The higher order bits are cleared. 3565 static __inline__ __m128i __DEFAULT_FN_ATTRS 3566 _mm_loadl_epi64(__m128i const *__p) 3567 { 3568 struct __mm_loadl_epi64_struct { 3569 long long __u; 3570 } __attribute__((__packed__, __may_alias__)); 3571 return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0}; 3572 } 3573 3574 /// \brief Generates a 128-bit vector of [4 x i32] with unspecified content. 3575 /// This could be used as an argument to another intrinsic function where the 3576 /// argument is required but the value is not actually used. 
3577 /// 3578 /// \headerfile <x86intrin.h> 3579 /// 3580 /// This intrinsic has no corresponding instruction. 3581 /// 3582 /// \returns A 128-bit vector of [4 x i32] with unspecified content. 3583 static __inline__ __m128i __DEFAULT_FN_ATTRS 3584 _mm_undefined_si128(void) 3585 { 3586 return (__m128i)__builtin_ia32_undef128(); 3587 } 3588 3589 /// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3590 /// the specified 64-bit integer values. 3591 /// 3592 /// \headerfile <x86intrin.h> 3593 /// 3594 /// This intrinsic is a utility function and does not correspond to a specific 3595 /// instruction. 3596 /// 3597 /// \param __q1 3598 /// A 64-bit integer value used to initialize the upper 64 bits of the 3599 /// destination vector of [2 x i64]. 3600 /// \param __q0 3601 /// A 64-bit integer value used to initialize the lower 64 bits of the 3602 /// destination vector of [2 x i64]. 3603 /// \returns An initialized 128-bit vector of [2 x i64] containing the values 3604 /// provided in the operands. 3605 static __inline__ __m128i __DEFAULT_FN_ATTRS 3606 _mm_set_epi64x(long long __q1, long long __q0) 3607 { 3608 return (__m128i){ __q0, __q1 }; 3609 } 3610 3611 /// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3612 /// the specified 64-bit integer values. 3613 /// 3614 /// \headerfile <x86intrin.h> 3615 /// 3616 /// This intrinsic is a utility function and does not correspond to a specific 3617 /// instruction. 3618 /// 3619 /// \param __q1 3620 /// A 64-bit integer value used to initialize the upper 64 bits of the 3621 /// destination vector of [2 x i64]. 3622 /// \param __q0 3623 /// A 64-bit integer value used to initialize the lower 64 bits of the 3624 /// destination vector of [2 x i64]. 3625 /// \returns An initialized 128-bit vector of [2 x i64] containing the values 3626 /// provided in the operands. 
3627 static __inline__ __m128i __DEFAULT_FN_ATTRS 3628 _mm_set_epi64(__m64 __q1, __m64 __q0) 3629 { 3630 return (__m128i){ (long long)__q0, (long long)__q1 }; 3631 } 3632 3633 /// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with 3634 /// the specified 32-bit integer values. 3635 /// 3636 /// \headerfile <x86intrin.h> 3637 /// 3638 /// This intrinsic is a utility function and does not correspond to a specific 3639 /// instruction. 3640 /// 3641 /// \param __i3 3642 /// A 32-bit integer value used to initialize bits [127:96] of the 3643 /// destination vector. 3644 /// \param __i2 3645 /// A 32-bit integer value used to initialize bits [95:64] of the destination 3646 /// vector. 3647 /// \param __i1 3648 /// A 32-bit integer value used to initialize bits [63:32] of the destination 3649 /// vector. 3650 /// \param __i0 3651 /// A 32-bit integer value used to initialize bits [31:0] of the destination 3652 /// vector. 3653 /// \returns An initialized 128-bit vector of [4 x i32] containing the values 3654 /// provided in the operands. 3655 static __inline__ __m128i __DEFAULT_FN_ATTRS 3656 _mm_set_epi32(int __i3, int __i2, int __i1, int __i0) 3657 { 3658 return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; 3659 } 3660 3661 /// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with 3662 /// the specified 16-bit integer values. 3663 /// 3664 /// \headerfile <x86intrin.h> 3665 /// 3666 /// This intrinsic is a utility function and does not correspond to a specific 3667 /// instruction. 3668 /// 3669 /// \param __w7 3670 /// A 16-bit integer value used to initialize bits [127:112] of the 3671 /// destination vector. 3672 /// \param __w6 3673 /// A 16-bit integer value used to initialize bits [111:96] of the 3674 /// destination vector. 3675 /// \param __w5 3676 /// A 16-bit integer value used to initialize bits [95:80] of the destination 3677 /// vector. 
3678 /// \param __w4 3679 /// A 16-bit integer value used to initialize bits [79:64] of the destination 3680 /// vector. 3681 /// \param __w3 3682 /// A 16-bit integer value used to initialize bits [63:48] of the destination 3683 /// vector. 3684 /// \param __w2 3685 /// A 16-bit integer value used to initialize bits [47:32] of the destination 3686 /// vector. 3687 /// \param __w1 3688 /// A 16-bit integer value used to initialize bits [31:16] of the destination 3689 /// vector. 3690 /// \param __w0 3691 /// A 16-bit integer value used to initialize bits [15:0] of the destination 3692 /// vector. 3693 /// \returns An initialized 128-bit vector of [8 x i16] containing the values 3694 /// provided in the operands. 3695 static __inline__ __m128i __DEFAULT_FN_ATTRS 3696 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0) 3697 { 3698 return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; 3699 } 3700 3701 /// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with 3702 /// the specified 8-bit integer values. 3703 /// 3704 /// \headerfile <x86intrin.h> 3705 /// 3706 /// This intrinsic is a utility function and does not correspond to a specific 3707 /// instruction. 3708 /// 3709 /// \param __b15 3710 /// Initializes bits [127:120] of the destination vector. 3711 /// \param __b14 3712 /// Initializes bits [119:112] of the destination vector. 3713 /// \param __b13 3714 /// Initializes bits [111:104] of the destination vector. 3715 /// \param __b12 3716 /// Initializes bits [103:96] of the destination vector. 3717 /// \param __b11 3718 /// Initializes bits [95:88] of the destination vector. 3719 /// \param __b10 3720 /// Initializes bits [87:80] of the destination vector. 3721 /// \param __b9 3722 /// Initializes bits [79:72] of the destination vector. 3723 /// \param __b8 3724 /// Initializes bits [71:64] of the destination vector. 
3725 /// \param __b7 3726 /// Initializes bits [63:56] of the destination vector. 3727 /// \param __b6 3728 /// Initializes bits [55:48] of the destination vector. 3729 /// \param __b5 3730 /// Initializes bits [47:40] of the destination vector. 3731 /// \param __b4 3732 /// Initializes bits [39:32] of the destination vector. 3733 /// \param __b3 3734 /// Initializes bits [31:24] of the destination vector. 3735 /// \param __b2 3736 /// Initializes bits [23:16] of the destination vector. 3737 /// \param __b1 3738 /// Initializes bits [15:8] of the destination vector. 3739 /// \param __b0 3740 /// Initializes bits [7:0] of the destination vector. 3741 /// \returns An initialized 128-bit vector of [16 x i8] containing the values 3742 /// provided in the operands. 3743 static __inline__ __m128i __DEFAULT_FN_ATTRS 3744 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) 3745 { 3746 return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; 3747 } 3748 3749 /// \brief Initializes both values in a 128-bit integer vector with the 3750 /// specified 64-bit integer value. 3751 /// 3752 /// \headerfile <x86intrin.h> 3753 /// 3754 /// This intrinsic is a utility function and does not correspond to a specific 3755 /// instruction. 3756 /// 3757 /// \param __q 3758 /// Integer value used to initialize the elements of the destination integer 3759 /// vector. 3760 /// \returns An initialized 128-bit integer vector of [2 x i64] with both 3761 /// elements containing the value provided in the operand. 3762 static __inline__ __m128i __DEFAULT_FN_ATTRS 3763 _mm_set1_epi64x(long long __q) 3764 { 3765 return (__m128i){ __q, __q }; 3766 } 3767 3768 /// \brief Initializes both values in a 128-bit vector of [2 x i64] with the 3769 /// specified 64-bit value. 
3770 /// 3771 /// \headerfile <x86intrin.h> 3772 /// 3773 /// This intrinsic is a utility function and does not correspond to a specific 3774 /// instruction. 3775 /// 3776 /// \param __q 3777 /// A 64-bit value used to initialize the elements of the destination integer 3778 /// vector. 3779 /// \returns An initialized 128-bit vector of [2 x i64] with all elements 3780 /// containing the value provided in the operand. 3781 static __inline__ __m128i __DEFAULT_FN_ATTRS 3782 _mm_set1_epi64(__m64 __q) 3783 { 3784 return (__m128i){ (long long)__q, (long long)__q }; 3785 } 3786 3787 /// \brief Initializes all values in a 128-bit vector of [4 x i32] with the 3788 /// specified 32-bit value. 3789 /// 3790 /// \headerfile <x86intrin.h> 3791 /// 3792 /// This intrinsic is a utility function and does not correspond to a specific 3793 /// instruction. 3794 /// 3795 /// \param __i 3796 /// A 32-bit value used to initialize the elements of the destination integer 3797 /// vector. 3798 /// \returns An initialized 128-bit vector of [4 x i32] with all elements 3799 /// containing the value provided in the operand. 3800 static __inline__ __m128i __DEFAULT_FN_ATTRS 3801 _mm_set1_epi32(int __i) 3802 { 3803 return (__m128i)(__v4si){ __i, __i, __i, __i }; 3804 } 3805 3806 /// \brief Initializes all values in a 128-bit vector of [8 x i16] with the 3807 /// specified 16-bit value. 3808 /// 3809 /// \headerfile <x86intrin.h> 3810 /// 3811 /// This intrinsic is a utility function and does not correspond to a specific 3812 /// instruction. 3813 /// 3814 /// \param __w 3815 /// A 16-bit value used to initialize the elements of the destination integer 3816 /// vector. 3817 /// \returns An initialized 128-bit vector of [8 x i16] with all elements 3818 /// containing the value provided in the operand. 
3819 static __inline__ __m128i __DEFAULT_FN_ATTRS 3820 _mm_set1_epi16(short __w) 3821 { 3822 return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w }; 3823 } 3824 3825 /// \brief Initializes all values in a 128-bit vector of [16 x i8] with the 3826 /// specified 8-bit value. 3827 /// 3828 /// \headerfile <x86intrin.h> 3829 /// 3830 /// This intrinsic is a utility function and does not correspond to a specific 3831 /// instruction. 3832 /// 3833 /// \param __b 3834 /// An 8-bit value used to initialize the elements of the destination integer 3835 /// vector. 3836 /// \returns An initialized 128-bit vector of [16 x i8] with all elements 3837 /// containing the value provided in the operand. 3838 static __inline__ __m128i __DEFAULT_FN_ATTRS 3839 _mm_set1_epi8(char __b) 3840 { 3841 return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b }; 3842 } 3843 3844 /// \brief Constructs a 128-bit integer vector, initialized in reverse order 3845 /// with the specified 64-bit integral values. 3846 /// 3847 /// \headerfile <x86intrin.h> 3848 /// 3849 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c> 3850 /// instruction. 3851 /// 3852 /// \param __q0 3853 /// A 64-bit integral value used to initialize the lower 64 bits of the 3854 /// result. 3855 /// \param __q1 3856 /// A 64-bit integral value used to initialize the upper 64 bits of the 3857 /// result. 3858 /// \returns An initialized 128-bit integer vector. 3859 static __inline__ __m128i __DEFAULT_FN_ATTRS 3860 _mm_setr_epi64(__m64 __q0, __m64 __q1) 3861 { 3862 return (__m128i){ (long long)__q0, (long long)__q1 }; 3863 } 3864 3865 /// \brief Constructs a 128-bit integer vector, initialized in reverse order 3866 /// with the specified 32-bit integral values. 3867 /// 3868 /// \headerfile <x86intrin.h> 3869 /// 3870 /// This intrinsic is a utility function and does not correspond to a specific 3871 /// instruction. 
3872 /// 3873 /// \param __i0 3874 /// A 32-bit integral value used to initialize bits [31:0] of the result. 3875 /// \param __i1 3876 /// A 32-bit integral value used to initialize bits [63:32] of the result. 3877 /// \param __i2 3878 /// A 32-bit integral value used to initialize bits [95:64] of the result. 3879 /// \param __i3 3880 /// A 32-bit integral value used to initialize bits [127:96] of the result. 3881 /// \returns An initialized 128-bit integer vector. 3882 static __inline__ __m128i __DEFAULT_FN_ATTRS 3883 _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) 3884 { 3885 return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; 3886 } 3887 3888 /// \brief Constructs a 128-bit integer vector, initialized in reverse order 3889 /// with the specified 16-bit integral values. 3890 /// 3891 /// \headerfile <x86intrin.h> 3892 /// 3893 /// This intrinsic is a utility function and does not correspond to a specific 3894 /// instruction. 3895 /// 3896 /// \param __w0 3897 /// A 16-bit integral value used to initialize bits [15:0] of the result. 3898 /// \param __w1 3899 /// A 16-bit integral value used to initialize bits [31:16] of the result. 3900 /// \param __w2 3901 /// A 16-bit integral value used to initialize bits [47:32] of the result. 3902 /// \param __w3 3903 /// A 16-bit integral value used to initialize bits [63:48] of the result. 3904 /// \param __w4 3905 /// A 16-bit integral value used to initialize bits [79:64] of the result. 3906 /// \param __w5 3907 /// A 16-bit integral value used to initialize bits [95:80] of the result. 3908 /// \param __w6 3909 /// A 16-bit integral value used to initialize bits [111:96] of the result. 3910 /// \param __w7 3911 /// A 16-bit integral value used to initialize bits [127:112] of the result. 3912 /// \returns An initialized 128-bit integer vector. 
3913 static __inline__ __m128i __DEFAULT_FN_ATTRS 3914 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7) 3915 { 3916 return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; 3917 } 3918 3919 /// \brief Constructs a 128-bit integer vector, initialized in reverse order 3920 /// with the specified 8-bit integral values. 3921 /// 3922 /// \headerfile <x86intrin.h> 3923 /// 3924 /// This intrinsic is a utility function and does not correspond to a specific 3925 /// instruction. 3926 /// 3927 /// \param __b0 3928 /// An 8-bit integral value used to initialize bits [7:0] of the result. 3929 /// \param __b1 3930 /// An 8-bit integral value used to initialize bits [15:8] of the result. 3931 /// \param __b2 3932 /// An 8-bit integral value used to initialize bits [23:16] of the result. 3933 /// \param __b3 3934 /// An 8-bit integral value used to initialize bits [31:24] of the result. 3935 /// \param __b4 3936 /// An 8-bit integral value used to initialize bits [39:32] of the result. 3937 /// \param __b5 3938 /// An 8-bit integral value used to initialize bits [47:40] of the result. 3939 /// \param __b6 3940 /// An 8-bit integral value used to initialize bits [55:48] of the result. 3941 /// \param __b7 3942 /// An 8-bit integral value used to initialize bits [63:56] of the result. 3943 /// \param __b8 3944 /// An 8-bit integral value used to initialize bits [71:64] of the result. 3945 /// \param __b9 3946 /// An 8-bit integral value used to initialize bits [79:72] of the result. 3947 /// \param __b10 3948 /// An 8-bit integral value used to initialize bits [87:80] of the result. 3949 /// \param __b11 3950 /// An 8-bit integral value used to initialize bits [95:88] of the result. 3951 /// \param __b12 3952 /// An 8-bit integral value used to initialize bits [103:96] of the result. 3953 /// \param __b13 3954 /// An 8-bit integral value used to initialize bits [111:104] of the result. 
3955 /// \param __b14 3956 /// An 8-bit integral value used to initialize bits [119:112] of the result. 3957 /// \param __b15 3958 /// An 8-bit integral value used to initialize bits [127:120] of the result. 3959 /// \returns An initialized 128-bit integer vector. 3960 static __inline__ __m128i __DEFAULT_FN_ATTRS 3961 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15) 3962 { 3963 return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; 3964 } 3965 3966 /// \brief Creates a 128-bit integer vector initialized to zero. 3967 /// 3968 /// \headerfile <x86intrin.h> 3969 /// 3970 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 3971 /// 3972 /// \returns An initialized 128-bit integer vector with all elements set to 3973 /// zero. 3974 static __inline__ __m128i __DEFAULT_FN_ATTRS 3975 _mm_setzero_si128(void) 3976 { 3977 return (__m128i){ 0LL, 0LL }; 3978 } 3979 3980 /// \brief Stores a 128-bit integer vector to a memory location aligned on a 3981 /// 128-bit boundary. 3982 /// 3983 /// \headerfile <x86intrin.h> 3984 /// 3985 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. 3986 /// 3987 /// \param __p 3988 /// A pointer to an aligned memory location that will receive the integer 3989 /// values. 3990 /// \param __b 3991 /// A 128-bit integer vector containing the values to be moved. 3992 static __inline__ void __DEFAULT_FN_ATTRS 3993 _mm_store_si128(__m128i *__p, __m128i __b) 3994 { 3995 *__p = __b; 3996 } 3997 3998 /// \brief Stores a 128-bit integer vector to an unaligned memory location. 3999 /// 4000 /// \headerfile <x86intrin.h> 4001 /// 4002 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. 
4003 /// 4004 /// \param __p 4005 /// A pointer to a memory location that will receive the integer values. 4006 /// \param __b 4007 /// A 128-bit integer vector containing the values to be moved. 4008 static __inline__ void __DEFAULT_FN_ATTRS 4009 _mm_storeu_si128(__m128i *__p, __m128i __b) 4010 { 4011 struct __storeu_si128 { 4012 __m128i __v; 4013 } __attribute__((__packed__, __may_alias__)); 4014 ((struct __storeu_si128*)__p)->__v = __b; 4015 } 4016 4017 /// \brief Moves bytes selected by the mask from the first operand to the 4018 /// specified unaligned memory location. When a mask bit is 1, the 4019 /// corresponding byte is written, otherwise it is not written. 4020 /// 4021 /// To minimize caching, the date is flagged as non-temporal (unlikely to be 4022 /// used again soon). Exception and trap behavior for elements not selected 4023 /// for storage to memory are implementation dependent. 4024 /// 4025 /// \headerfile <x86intrin.h> 4026 /// 4027 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c> 4028 /// instruction. 4029 /// 4030 /// \param __d 4031 /// A 128-bit integer vector containing the values to be moved. 4032 /// \param __n 4033 /// A 128-bit integer vector containing the mask. The most significant bit of 4034 /// each byte represents the mask bits. 4035 /// \param __p 4036 /// A pointer to an unaligned 128-bit memory location where the specified 4037 /// values are moved. 4038 static __inline__ void __DEFAULT_FN_ATTRS 4039 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) 4040 { 4041 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); 4042 } 4043 4044 /// \brief Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to 4045 /// a memory location. 4046 /// 4047 /// \headerfile <x86intrin.h> 4048 /// 4049 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction. 
4050 /// 4051 /// \param __p 4052 /// A pointer to a 64-bit memory location that will receive the lower 64 bits 4053 /// of the integer vector parameter. 4054 /// \param __a 4055 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the 4056 /// value to be stored. 4057 static __inline__ void __DEFAULT_FN_ATTRS 4058 _mm_storel_epi64(__m128i *__p, __m128i __a) 4059 { 4060 struct __mm_storel_epi64_struct { 4061 long long __u; 4062 } __attribute__((__packed__, __may_alias__)); 4063 ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; 4064 } 4065 4066 /// \brief Stores a 128-bit floating point vector of [2 x double] to a 128-bit 4067 /// aligned memory location. 4068 /// 4069 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4070 /// used again soon). 4071 /// 4072 /// \headerfile <x86intrin.h> 4073 /// 4074 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 4075 /// 4076 /// \param __p 4077 /// A pointer to the 128-bit aligned memory location used to store the value. 4078 /// \param __a 4079 /// A vector of [2 x double] containing the 64-bit values to be stored. 4080 static __inline__ void __DEFAULT_FN_ATTRS 4081 _mm_stream_pd(double *__p, __m128d __a) 4082 { 4083 __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p); 4084 } 4085 4086 /// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location. 4087 /// 4088 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4089 /// used again soon). 4090 /// 4091 /// \headerfile <x86intrin.h> 4092 /// 4093 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 4094 /// 4095 /// \param __p 4096 /// A pointer to the 128-bit aligned memory location used to store the value. 4097 /// \param __a 4098 /// A 128-bit integer vector containing the values to be stored. 
4099 static __inline__ void __DEFAULT_FN_ATTRS 4100 _mm_stream_si128(__m128i *__p, __m128i __a) 4101 { 4102 __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p); 4103 } 4104 4105 /// \brief Stores a 32-bit integer value in the specified memory location. 4106 /// 4107 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4108 /// used again soon). 4109 /// 4110 /// \headerfile <x86intrin.h> 4111 /// 4112 /// This intrinsic corresponds to the <c> MOVNTI </c> instruction. 4113 /// 4114 /// \param __p 4115 /// A pointer to the 32-bit memory location used to store the value. 4116 /// \param __a 4117 /// A 32-bit integer containing the value to be stored. 4118 static __inline__ void __DEFAULT_FN_ATTRS 4119 _mm_stream_si32(int *__p, int __a) 4120 { 4121 __builtin_ia32_movnti(__p, __a); 4122 } 4123 4124 #ifdef __x86_64__ 4125 /// \brief Stores a 64-bit integer value in the specified memory location. 4126 /// 4127 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4128 /// used again soon). 4129 /// 4130 /// \headerfile <x86intrin.h> 4131 /// 4132 /// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction. 4133 /// 4134 /// \param __p 4135 /// A pointer to the 64-bit memory location used to store the value. 4136 /// \param __a 4137 /// A 64-bit integer containing the value to be stored. 4138 static __inline__ void __DEFAULT_FN_ATTRS 4139 _mm_stream_si64(long long *__p, long long __a) 4140 { 4141 __builtin_ia32_movnti64(__p, __a); 4142 } 4143 #endif 4144 4145 #if defined(__cplusplus) 4146 extern "C" { 4147 #endif 4148 4149 /// \brief The cache line containing \a __p is flushed and invalidated from all 4150 /// caches in the coherency domain. 4151 /// 4152 /// \headerfile <x86intrin.h> 4153 /// 4154 /// This intrinsic corresponds to the <c> CLFLUSH </c> instruction. 4155 /// 4156 /// \param __p 4157 /// A pointer to the memory location used to identify the cache line to be 4158 /// flushed. 
void _mm_clflush(void const *__p);

/// \brief Forces strong memory ordering (serialization) between the load
///    instructions that precede this instruction and the load instructions
///    that follow it, ensuring that the system completes all previous loads
///    before executing subsequent loads.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// \brief Forces strong memory ordering (serialization) between the load and
///    store instructions that precede this instruction and the load and store
///    instructions that follow it, ensuring that the system completes all
///    previous memory accesses before executing subsequent memory accesses.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif

/// \brief Converts the 16-bit signed integers of both 128-bit integer vector
///    operands into 8-bit signed integers and packs the results into the
///    destination. Positive values greater than 0x7F are saturated to 0x7F.
///    Negative values less than 0x80 are saturated to 0x80.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
///
/// \param __a
///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
///    a signed integer and is converted to an 8-bit signed integer with
///    saturation. Values greater than 0x7F are saturated to 0x7F. Values less
///    than 0x80 are saturated to 0x80. The converted [8 x i8] values are
///    written to the lower 64 bits of the result.
/// \param __b
///    A 128-bit integer vector of [8 x i16].
Each 16-bit element is treated as 4204 /// a signed integer and is converted to a 8-bit signed integer with 4205 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less 4206 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are 4207 /// written to the higher 64 bits of the result. 4208 /// \returns A 128-bit vector of [16 x i8] containing the converted values. 4209 static __inline__ __m128i __DEFAULT_FN_ATTRS 4210 _mm_packs_epi16(__m128i __a, __m128i __b) 4211 { 4212 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); 4213 } 4214 4215 /// \brief Converts 32-bit signed integers from both 128-bit integer vector 4216 /// operands into 16-bit signed integers, and packs the results into the 4217 /// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF. 4218 /// Negative values less than 0x8000 are saturated to 0x8000. 4219 /// 4220 /// \headerfile <x86intrin.h> 4221 /// 4222 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction. 4223 /// 4224 /// \param __a 4225 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as 4226 /// a signed integer and is converted to a 16-bit signed integer with 4227 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values 4228 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values 4229 /// are written to the lower 64 bits of the result. 4230 /// \param __b 4231 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as 4232 /// a signed integer and is converted to a 16-bit signed integer with 4233 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values 4234 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values 4235 /// are written to the higher 64 bits of the result. 4236 /// \returns A 128-bit vector of [8 x i16] containing the converted values. 
4237 static __inline__ __m128i __DEFAULT_FN_ATTRS 4238 _mm_packs_epi32(__m128i __a, __m128i __b) 4239 { 4240 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); 4241 } 4242 4243 /// \brief Converts 16-bit signed integers from both 128-bit integer vector 4244 /// operands into 8-bit unsigned integers, and packs the results into the 4245 /// destination. Values greater than 0xFF are saturated to 0xFF. Values less 4246 /// than 0x00 are saturated to 0x00. 4247 /// 4248 /// \headerfile <x86intrin.h> 4249 /// 4250 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction. 4251 /// 4252 /// \param __a 4253 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4254 /// a signed integer and is converted to an 8-bit unsigned integer with 4255 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less 4256 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are 4257 /// written to the lower 64 bits of the result. 4258 /// \param __b 4259 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4260 /// a signed integer and is converted to an 8-bit unsigned integer with 4261 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less 4262 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are 4263 /// written to the higher 64 bits of the result. 4264 /// \returns A 128-bit vector of [16 x i8] containing the converted values. 4265 static __inline__ __m128i __DEFAULT_FN_ATTRS 4266 _mm_packus_epi16(__m128i __a, __m128i __b) 4267 { 4268 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); 4269 } 4270 4271 /// \brief Extracts 16 bits from a 128-bit integer vector of [8 x i16], using 4272 /// the immediate-value parameter as a selector. 4273 /// 4274 /// \headerfile <x86intrin.h> 4275 /// 4276 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction. 
4277 /// 4278 /// \param __a 4279 /// A 128-bit integer vector. 4280 /// \param __imm 4281 /// An immediate value. Bits [2:0] selects values from \a __a to be assigned 4282 /// to bits[15:0] of the result. \n 4283 /// 000: assign values from bits [15:0] of \a __a. \n 4284 /// 001: assign values from bits [31:16] of \a __a. \n 4285 /// 010: assign values from bits [47:32] of \a __a. \n 4286 /// 011: assign values from bits [63:48] of \a __a. \n 4287 /// 100: assign values from bits [79:64] of \a __a. \n 4288 /// 101: assign values from bits [95:80] of \a __a. \n 4289 /// 110: assign values from bits [111:96] of \a __a. \n 4290 /// 111: assign values from bits [127:112] of \a __a. 4291 /// \returns An integer, whose lower 16 bits are selected from the 128-bit 4292 /// integer vector parameter and the remaining bits are assigned zeros. 4293 static __inline__ int __DEFAULT_FN_ATTRS 4294 _mm_extract_epi16(__m128i __a, int __imm) 4295 { 4296 __v8hi __b = (__v8hi)__a; 4297 return (unsigned short)__b[__imm & 7]; 4298 } 4299 4300 /// \brief Constructs a 128-bit integer vector by first making a copy of the 4301 /// 128-bit integer vector parameter, and then inserting the lower 16 bits 4302 /// of an integer parameter into an offset specified by the immediate-value 4303 /// parameter. 4304 /// 4305 /// \headerfile <x86intrin.h> 4306 /// 4307 /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction. 4308 /// 4309 /// \param __a 4310 /// A 128-bit integer vector of [8 x i16]. This vector is copied to the 4311 /// result and then one of the eight elements in the result is replaced by 4312 /// the lower 16 bits of \a __b. 4313 /// \param __b 4314 /// An integer. The lower 16 bits of this parameter are written to the 4315 /// result beginning at an offset specified by \a __imm. 4316 /// \param __imm 4317 /// An immediate value specifying the bit offset in the result at which the 4318 /// lower 16 bits of \a __b are written. 
4319 /// \returns A 128-bit integer vector containing the constructed values. 4320 static __inline__ __m128i __DEFAULT_FN_ATTRS 4321 _mm_insert_epi16(__m128i __a, int __b, int __imm) 4322 { 4323 __v8hi __c = (__v8hi)__a; 4324 __c[__imm & 7] = __b; 4325 return (__m128i)__c; 4326 } 4327 4328 /// \brief Copies the values of the most significant bits from each 8-bit 4329 /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask 4330 /// value, zero-extends the value, and writes it to the destination. 4331 /// 4332 /// \headerfile <x86intrin.h> 4333 /// 4334 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction. 4335 /// 4336 /// \param __a 4337 /// A 128-bit integer vector containing the values with bits to be extracted. 4338 /// \returns The most significant bits from each 8-bit element in \a __a, 4339 /// written to bits [15:0]. The other bits are assigned zeros. 4340 static __inline__ int __DEFAULT_FN_ATTRS 4341 _mm_movemask_epi8(__m128i __a) 4342 { 4343 return __builtin_ia32_pmovmskb128((__v16qi)__a); 4344 } 4345 4346 /// \brief Constructs a 128-bit integer vector by shuffling four 32-bit 4347 /// elements of a 128-bit integer vector parameter, using the immediate-value 4348 /// parameter as a specifier. 4349 /// 4350 /// \headerfile <x86intrin.h> 4351 /// 4352 /// \code 4353 /// __m128i _mm_shuffle_epi32(__m128i a, const int imm); 4354 /// \endcode 4355 /// 4356 /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction. 4357 /// 4358 /// \param a 4359 /// A 128-bit integer vector containing the values to be copied. 4360 /// \param imm 4361 /// An immediate value containing an 8-bit value specifying which elements to 4362 /// copy from a. The destinations within the 128-bit destination are assigned 4363 /// values as follows: \n 4364 /// Bits [1:0] are used to assign values to bits [31:0] of the result. \n 4365 /// Bits [3:2] are used to assign values to bits [63:32] of the result. 
///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [31:0] of \a a. \n
///    01: assign values from bits [63:32] of \a a. \n
///    10: assign values from bits [95:64] of \a a. \n
///    11: assign values from bits [127:96] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
                                   (__v4si)_mm_undefined_si128(), \
                                   (imm) & 0x3, \
                                   ((imm) >> 2) & 0x3, \
                                   ((imm) >> 4) & 0x3, \
                                   ((imm) >> 6) & 0x3); })

/// \brief Constructs a 128-bit integer vector by shuffling the four lower
///    16-bit elements of a 128-bit integer vector of [8 x i16], using the
///    immediate-value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
///    Bits[7:6] are used to assign values to bits [63:48] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [15:0] of \a a. \n
///    01: assign values from bits [31:16] of \a a. \n
///    10: assign values from bits [47:32] of \a a. \n
///    11: assign values from bits [63:48] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_undefined_si128(), \
                                   (imm) & 0x3, \
                                   ((imm) >> 2) & 0x3, \
                                   ((imm) >> 4) & 0x3, \
                                   ((imm) >> 6) & 0x3, \
                                   4, 5, 6, 7); })

/// \brief Constructs a 128-bit integer vector by shuffling the four upper
///    16-bit elements of a 128-bit integer vector of [8 x i16], using the
///    immediate-value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_undefined_si128(), \
                                   0, 1, 2, 3, \
                                   4 + ((imm) & 0x3), \
                                   4 + (((imm) >> 2) & 0x3), \
                                   4 + (((imm) >> 4) & 0x3), \
                                   4 + (((imm) >> 6) & 0x3)); })

/// \brief Unpacks the high-order (index 8-15) values from two 128-bit vectors
///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
///    instruction.
///
/// \param __a
///    A 128-bit vector of [16 x i8]. \n
///    Bits [71:64] are written to bits [7:0] of the result. \n
///    Bits [79:72] are written to bits [23:16] of the result. \n
///    Bits [87:80] are written to bits [39:32] of the result. \n
///    Bits [95:88] are written to bits [55:48] of the result. \n
///    Bits [103:96] are written to bits [71:64] of the result. \n
///    Bits [111:104] are written to bits [87:80] of the result. \n
///    Bits [119:112] are written to bits [103:96] of the result. \n
///    Bits [127:120] are written to bits [119:112] of the result.
/// \param __b
///    A 128-bit vector of [16 x i8]. \n
///    Bits [71:64] are written to bits [15:8] of the result. \n
///    Bits [79:72] are written to bits [31:24] of the result. \n
///    Bits [87:80] are written to bits [47:40] of the result. \n
///    Bits [95:88] are written to bits [63:56] of the result. \n
///    Bits [103:96] are written to bits [79:72] of the result. \n
///    Bits [111:104] are written to bits [95:88] of the result. \n
///    Bits [119:112] are written to bits [111:104] of the result. \n
///    Bits [127:120] are written to bits [127:120] of the result.
/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4479 static __inline__ __m128i __DEFAULT_FN_ATTRS 4480 _mm_unpackhi_epi8(__m128i __a, __m128i __b) 4481 { 4482 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 4483 } 4484 4485 /// \brief Unpacks the high-order (index 4-7) values from two 128-bit vectors of 4486 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16]. 4487 /// 4488 /// \headerfile <x86intrin.h> 4489 /// 4490 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c> 4491 /// instruction. 4492 /// 4493 /// \param __a 4494 /// A 128-bit vector of [8 x i16]. 4495 /// Bits [79:64] are written to bits [15:0] of the result. \n 4496 /// Bits [95:80] are written to bits [47:32] of the result. \n 4497 /// Bits [111:96] are written to bits [79:64] of the result. \n 4498 /// Bits [127:112] are written to bits [111:96] of the result. 4499 /// \param __b 4500 /// A 128-bit vector of [8 x i16]. 4501 /// Bits [79:64] are written to bits [31:16] of the result. \n 4502 /// Bits [95:80] are written to bits [63:48] of the result. \n 4503 /// Bits [111:96] are written to bits [95:80] of the result. \n 4504 /// Bits [127:112] are written to bits [127:112] of the result. 4505 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 4506 static __inline__ __m128i __DEFAULT_FN_ATTRS 4507 _mm_unpackhi_epi16(__m128i __a, __m128i __b) 4508 { 4509 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 4510 } 4511 4512 /// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of 4513 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4514 /// 4515 /// \headerfile <x86intrin.h> 4516 /// 4517 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c> 4518 /// instruction. 4519 /// 4520 /// \param __a 4521 /// A 128-bit vector of [4 x i32]. 
\n 4522 /// Bits [95:64] are written to bits [31:0] of the destination. \n 4523 /// Bits [127:96] are written to bits [95:64] of the destination. 4524 /// \param __b 4525 /// A 128-bit vector of [4 x i32]. \n 4526 /// Bits [95:64] are written to bits [64:32] of the destination. \n 4527 /// Bits [127:96] are written to bits [127:96] of the destination. 4528 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 4529 static __inline__ __m128i __DEFAULT_FN_ATTRS 4530 _mm_unpackhi_epi32(__m128i __a, __m128i __b) 4531 { 4532 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); 4533 } 4534 4535 /// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors 4536 /// of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 4537 /// 4538 /// \headerfile <x86intrin.h> 4539 /// 4540 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c> 4541 /// instruction. 4542 /// 4543 /// \param __a 4544 /// A 128-bit vector of [2 x i64]. \n 4545 /// Bits [127:64] are written to bits [63:0] of the destination. 4546 /// \param __b 4547 /// A 128-bit vector of [2 x i64]. \n 4548 /// Bits [127:64] are written to bits [127:64] of the destination. 4549 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 4550 static __inline__ __m128i __DEFAULT_FN_ATTRS 4551 _mm_unpackhi_epi64(__m128i __a, __m128i __b) 4552 { 4553 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1); 4554 } 4555 4556 /// \brief Unpacks the low-order (index 0-7) values from two 128-bit vectors of 4557 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. 4558 /// 4559 /// \headerfile <x86intrin.h> 4560 /// 4561 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c> 4562 /// instruction. 4563 /// 4564 /// \param __a 4565 /// A 128-bit vector of [16 x i8]. \n 4566 /// Bits [7:0] are written to bits [7:0] of the result. 
\n 4567 /// Bits [15:8] are written to bits [23:16] of the result. \n 4568 /// Bits [23:16] are written to bits [39:32] of the result. \n 4569 /// Bits [31:24] are written to bits [55:48] of the result. \n 4570 /// Bits [39:32] are written to bits [71:64] of the result. \n 4571 /// Bits [47:40] are written to bits [87:80] of the result. \n 4572 /// Bits [55:48] are written to bits [103:96] of the result. \n 4573 /// Bits [63:56] are written to bits [119:112] of the result. 4574 /// \param __b 4575 /// A 128-bit vector of [16 x i8]. 4576 /// Bits [7:0] are written to bits [15:8] of the result. \n 4577 /// Bits [15:8] are written to bits [31:24] of the result. \n 4578 /// Bits [23:16] are written to bits [47:40] of the result. \n 4579 /// Bits [31:24] are written to bits [63:56] of the result. \n 4580 /// Bits [39:32] are written to bits [79:72] of the result. \n 4581 /// Bits [47:40] are written to bits [95:88] of the result. \n 4582 /// Bits [55:48] are written to bits [111:104] of the result. \n 4583 /// Bits [63:56] are written to bits [127:120] of the result. 4584 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 4585 static __inline__ __m128i __DEFAULT_FN_ATTRS 4586 _mm_unpacklo_epi8(__m128i __a, __m128i __b) 4587 { 4588 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 4589 } 4590 4591 /// \brief Unpacks the low-order (index 0-3) values from each of the two 128-bit 4592 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of 4593 /// [8 x i16]. 4594 /// 4595 /// \headerfile <x86intrin.h> 4596 /// 4597 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c> 4598 /// instruction. 4599 /// 4600 /// \param __a 4601 /// A 128-bit vector of [8 x i16]. 4602 /// Bits [15:0] are written to bits [15:0] of the result. \n 4603 /// Bits [31:16] are written to bits [47:32] of the result. 
\n 4604 /// Bits [47:32] are written to bits [79:64] of the result. \n 4605 /// Bits [63:48] are written to bits [111:96] of the result. 4606 /// \param __b 4607 /// A 128-bit vector of [8 x i16]. 4608 /// Bits [15:0] are written to bits [31:16] of the result. \n 4609 /// Bits [31:16] are written to bits [63:48] of the result. \n 4610 /// Bits [47:32] are written to bits [95:80] of the result. \n 4611 /// Bits [63:48] are written to bits [127:112] of the result. 4612 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 4613 static __inline__ __m128i __DEFAULT_FN_ATTRS 4614 _mm_unpacklo_epi16(__m128i __a, __m128i __b) 4615 { 4616 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 4617 } 4618 4619 /// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of 4620 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4621 /// 4622 /// \headerfile <x86intrin.h> 4623 /// 4624 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c> 4625 /// instruction. 4626 /// 4627 /// \param __a 4628 /// A 128-bit vector of [4 x i32]. \n 4629 /// Bits [31:0] are written to bits [31:0] of the destination. \n 4630 /// Bits [63:32] are written to bits [95:64] of the destination. 4631 /// \param __b 4632 /// A 128-bit vector of [4 x i32]. \n 4633 /// Bits [31:0] are written to bits [64:32] of the destination. \n 4634 /// Bits [63:32] are written to bits [127:96] of the destination. 4635 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 4636 static __inline__ __m128i __DEFAULT_FN_ATTRS 4637 _mm_unpacklo_epi32(__m128i __a, __m128i __b) 4638 { 4639 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); 4640 } 4641 4642 /// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors of 4643 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 
4644 /// 4645 /// \headerfile <x86intrin.h> 4646 /// 4647 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c> 4648 /// instruction. 4649 /// 4650 /// \param __a 4651 /// A 128-bit vector of [2 x i64]. \n 4652 /// Bits [63:0] are written to bits [63:0] of the destination. \n 4653 /// \param __b 4654 /// A 128-bit vector of [2 x i64]. \n 4655 /// Bits [63:0] are written to bits [127:64] of the destination. \n 4656 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 4657 static __inline__ __m128i __DEFAULT_FN_ATTRS 4658 _mm_unpacklo_epi64(__m128i __a, __m128i __b) 4659 { 4660 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0); 4661 } 4662 4663 /// \brief Returns the lower 64 bits of a 128-bit integer vector as a 64-bit 4664 /// integer. 4665 /// 4666 /// \headerfile <x86intrin.h> 4667 /// 4668 /// This intrinsic has no corresponding instruction. 4669 /// 4670 /// \param __a 4671 /// A 128-bit integer vector operand. The lower 64 bits are moved to the 4672 /// destination. 4673 /// \returns A 64-bit integer containing the lower 64 bits of the parameter. 4674 static __inline__ __m64 __DEFAULT_FN_ATTRS 4675 _mm_movepi64_pi64(__m128i __a) 4676 { 4677 return (__m64)__a[0]; 4678 } 4679 4680 /// \brief Moves the 64-bit operand to a 128-bit integer vector, zeroing the 4681 /// upper bits. 4682 /// 4683 /// \headerfile <x86intrin.h> 4684 /// 4685 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ / MOVD </c> instruction. 4686 /// 4687 /// \param __a 4688 /// A 64-bit value. 4689 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from 4690 /// the operand. The upper 64 bits are assigned zeros. 4691 static __inline__ __m128i __DEFAULT_FN_ATTRS 4692 _mm_movpi64_epi64(__m64 __a) 4693 { 4694 return (__m128i){ (long long)__a, 0 }; 4695 } 4696 4697 /// \brief Moves the lower 64 bits of a 128-bit integer vector to a 128-bit 4698 /// integer vector, zeroing the upper bits. 
4699 /// 4700 /// \headerfile <x86intrin.h> 4701 /// 4702 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 4703 /// 4704 /// \param __a 4705 /// A 128-bit integer vector operand. The lower 64 bits are moved to the 4706 /// destination. 4707 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from 4708 /// the operand. The upper 64 bits are assigned zeros. 4709 static __inline__ __m128i __DEFAULT_FN_ATTRS 4710 _mm_move_epi64(__m128i __a) 4711 { 4712 return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2); 4713 } 4714 4715 /// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors 4716 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x 4717 /// double]. 4718 /// 4719 /// \headerfile <x86intrin.h> 4720 /// 4721 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction. 4722 /// 4723 /// \param __a 4724 /// A 128-bit vector of [2 x double]. \n 4725 /// Bits [127:64] are written to bits [63:0] of the destination. 4726 /// \param __b 4727 /// A 128-bit vector of [2 x double]. \n 4728 /// Bits [127:64] are written to bits [127:64] of the destination. 4729 /// \returns A 128-bit vector of [2 x double] containing the interleaved values. 4730 static __inline__ __m128d __DEFAULT_FN_ATTRS 4731 _mm_unpackhi_pd(__m128d __a, __m128d __b) 4732 { 4733 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1); 4734 } 4735 4736 /// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors 4737 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x 4738 /// double]. 4739 /// 4740 /// \headerfile <x86intrin.h> 4741 /// 4742 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 4743 /// 4744 /// \param __a 4745 /// A 128-bit vector of [2 x double]. \n 4746 /// Bits [63:0] are written to bits [63:0] of the destination. 4747 /// \param __b 4748 /// A 128-bit vector of [2 x double]. 
\n 4749 /// Bits [63:0] are written to bits [127:64] of the destination. 4750 /// \returns A 128-bit vector of [2 x double] containing the interleaved values. 4751 static __inline__ __m128d __DEFAULT_FN_ATTRS 4752 _mm_unpacklo_pd(__m128d __a, __m128d __b) 4753 { 4754 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0); 4755 } 4756 4757 /// \brief Extracts the sign bits of the double-precision values in the 128-bit 4758 /// vector of [2 x double], zero-extends the value, and writes it to the 4759 /// low-order bits of the destination. 4760 /// 4761 /// \headerfile <x86intrin.h> 4762 /// 4763 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction. 4764 /// 4765 /// \param __a 4766 /// A 128-bit vector of [2 x double] containing the values with sign bits to 4767 /// be extracted. 4768 /// \returns The sign bits from each of the double-precision elements in \a __a, 4769 /// written to bits [1:0]. The remaining bits are assigned values of zero. 4770 static __inline__ int __DEFAULT_FN_ATTRS 4771 _mm_movemask_pd(__m128d __a) 4772 { 4773 return __builtin_ia32_movmskpd((__v2df)__a); 4774 } 4775 4776 4777 /// \brief Constructs a 128-bit floating-point vector of [2 x double] from two 4778 /// 128-bit vector parameters of [2 x double], using the immediate-value 4779 /// parameter as a specifier. 4780 /// 4781 /// \headerfile <x86intrin.h> 4782 /// 4783 /// \code 4784 /// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i); 4785 /// \endcode 4786 /// 4787 /// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction. 4788 /// 4789 /// \param a 4790 /// A 128-bit vector of [2 x double]. 4791 /// \param b 4792 /// A 128-bit vector of [2 x double]. 4793 /// \param i 4794 /// An 8-bit immediate value. The least significant two bits specify which 4795 /// elements to copy from a and b: \n 4796 /// Bit[0] = 0: lower element of a copied to lower element of result. 
\n 4797 /// Bit[0] = 1: upper element of a copied to lower element of result. \n 4798 /// Bit[1] = 0: lower element of \a b copied to upper element of result. \n 4799 /// Bit[1] = 1: upper element of \a b copied to upper element of result. \n 4800 /// \returns A 128-bit vector of [2 x double] containing the shuffled values. 4801 #define _mm_shuffle_pd(a, b, i) __extension__ ({ \ 4802 (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ 4803 0 + (((i) >> 0) & 0x1), \ 4804 2 + (((i) >> 1) & 0x1)); }) 4805 4806 /// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4807 /// floating-point vector of [4 x float]. 4808 /// 4809 /// \headerfile <x86intrin.h> 4810 /// 4811 /// This intrinsic has no corresponding instruction. 4812 /// 4813 /// \param __a 4814 /// A 128-bit floating-point vector of [2 x double]. 4815 /// \returns A 128-bit floating-point vector of [4 x float] containing the same 4816 /// bitwise pattern as the parameter. 4817 static __inline__ __m128 __DEFAULT_FN_ATTRS 4818 _mm_castpd_ps(__m128d __a) 4819 { 4820 return (__m128)__a; 4821 } 4822 4823 /// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4824 /// integer vector. 4825 /// 4826 /// \headerfile <x86intrin.h> 4827 /// 4828 /// This intrinsic has no corresponding instruction. 4829 /// 4830 /// \param __a 4831 /// A 128-bit floating-point vector of [2 x double]. 4832 /// \returns A 128-bit integer vector containing the same bitwise pattern as the 4833 /// parameter. 4834 static __inline__ __m128i __DEFAULT_FN_ATTRS 4835 _mm_castpd_si128(__m128d __a) 4836 { 4837 return (__m128i)__a; 4838 } 4839 4840 /// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4841 /// floating-point vector of [2 x double]. 4842 /// 4843 /// \headerfile <x86intrin.h> 4844 /// 4845 /// This intrinsic has no corresponding instruction. 4846 /// 4847 /// \param __a 4848 /// A 128-bit floating-point vector of [4 x float]. 
4849 /// \returns A 128-bit floating-point vector of [2 x double] containing the same 4850 /// bitwise pattern as the parameter. 4851 static __inline__ __m128d __DEFAULT_FN_ATTRS 4852 _mm_castps_pd(__m128 __a) 4853 { 4854 return (__m128d)__a; 4855 } 4856 4857 /// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4858 /// integer vector. 4859 /// 4860 /// \headerfile <x86intrin.h> 4861 /// 4862 /// This intrinsic has no corresponding instruction. 4863 /// 4864 /// \param __a 4865 /// A 128-bit floating-point vector of [4 x float]. 4866 /// \returns A 128-bit integer vector containing the same bitwise pattern as the 4867 /// parameter. 4868 static __inline__ __m128i __DEFAULT_FN_ATTRS 4869 _mm_castps_si128(__m128 __a) 4870 { 4871 return (__m128i)__a; 4872 } 4873 4874 /// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector 4875 /// of [4 x float]. 4876 /// 4877 /// \headerfile <x86intrin.h> 4878 /// 4879 /// This intrinsic has no corresponding instruction. 4880 /// 4881 /// \param __a 4882 /// A 128-bit integer vector. 4883 /// \returns A 128-bit floating-point vector of [4 x float] containing the same 4884 /// bitwise pattern as the parameter. 4885 static __inline__ __m128 __DEFAULT_FN_ATTRS 4886 _mm_castsi128_ps(__m128i __a) 4887 { 4888 return (__m128)__a; 4889 } 4890 4891 /// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector 4892 /// of [2 x double]. 4893 /// 4894 /// \headerfile <x86intrin.h> 4895 /// 4896 /// This intrinsic has no corresponding instruction. 4897 /// 4898 /// \param __a 4899 /// A 128-bit integer vector. 4900 /// \returns A 128-bit floating-point vector of [2 x double] containing the same 4901 /// bitwise pattern as the parameter. 
4902 static __inline__ __m128d __DEFAULT_FN_ATTRS 4903 _mm_castsi128_pd(__m128i __a) 4904 { 4905 return (__m128d)__a; 4906 } 4907 4908 #if defined(__cplusplus) 4909 extern "C" { 4910 #endif 4911 4912 /// \brief Indicates that a spin loop is being executed for the purposes of 4913 /// optimizing power consumption during the loop. 4914 /// 4915 /// \headerfile <x86intrin.h> 4916 /// 4917 /// This intrinsic corresponds to the <c> PAUSE </c> instruction. 4918 /// 4919 void _mm_pause(void); 4920 4921 #if defined(__cplusplus) 4922 } // extern "C" 4923 #endif 4924 #undef __DEFAULT_FN_ATTRS 4925 4926 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 4927 4928 #define _MM_DENORMALS_ZERO_ON (0x0040) 4929 #define _MM_DENORMALS_ZERO_OFF (0x0000) 4930 4931 #define _MM_DENORMALS_ZERO_MASK (0x0040) 4932 4933 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) 4934 #define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) 4935 4936 #endif /* __EMMINTRIN_H */ 4937