1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24 #ifndef __XMMINTRIN_H 25 #define __XMMINTRIN_H 26 27 #include <mmintrin.h> 28 29 typedef int __v4si __attribute__((__vector_size__(16))); 30 typedef float __v4sf __attribute__((__vector_size__(16))); 31 typedef float __m128 __attribute__((__vector_size__(16))); 32 33 /* Unsigned types */ 34 typedef unsigned int __v4su __attribute__((__vector_size__(16))); 35 36 /* This header should only be included in a hosted environment as it depends on 37 * a standard library to provide allocation routines. */ 38 #if __STDC_HOSTED__ 39 #include <mm_malloc.h> 40 #endif 41 42 /* Define the default attributes for the functions in this file. 
*/ 43 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"))) 44 45 /// \brief Adds the 32-bit float values in the low-order bits of the operands. 46 /// 47 /// \headerfile <x86intrin.h> 48 /// 49 /// This intrinsic corresponds to the \c VADDSS / ADDSS instructions. 50 /// 51 /// \param __a 52 /// A 128-bit vector of [4 x float] containing one of the source operands. 53 /// The lower 32 bits of this operand are used in the calculation. 54 /// \param __b 55 /// A 128-bit vector of [4 x float] containing one of the source operands. 56 /// The lower 32 bits of this operand are used in the calculation. 57 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum 58 /// of the lower 32 bits of both operands. The upper 96 bits are copied from 59 /// the upper 96 bits of the first source operand. 60 static __inline__ __m128 __DEFAULT_FN_ATTRS 61 _mm_add_ss(__m128 __a, __m128 __b) 62 { 63 __a[0] += __b[0]; 64 return __a; 65 } 66 67 /// \brief Adds two 128-bit vectors of [4 x float], and returns the results of 68 /// the addition. 69 /// 70 /// \headerfile <x86intrin.h> 71 /// 72 /// This intrinsic corresponds to the \c VADDPS / ADDPS instructions. 73 /// 74 /// \param __a 75 /// A 128-bit vector of [4 x float] containing one of the source operands. 76 /// \param __b 77 /// A 128-bit vector of [4 x float] containing one of the source operands. 78 /// \returns A 128-bit vector of [4 x float] containing the sums of both 79 /// operands. 80 static __inline__ __m128 __DEFAULT_FN_ATTRS 81 _mm_add_ps(__m128 __a, __m128 __b) 82 { 83 return (__m128)((__v4sf)__a + (__v4sf)__b); 84 } 85 86 /// \brief Subtracts the 32-bit float value in the low-order bits of the second 87 /// operand from the corresponding value in the first operand. 88 /// 89 /// \headerfile <x86intrin.h> 90 /// 91 /// This intrinsic corresponds to the \c VSUBSS / SUBSS instructions. 
92 /// 93 /// \param __a 94 /// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits 95 /// of this operand are used in the calculation. 96 /// \param __b 97 /// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32 98 /// bits of this operand are used in the calculation. 99 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 100 /// difference of the lower 32 bits of both operands. The upper 96 bits are 101 /// copied from the upper 96 bits of the first source operand. 102 static __inline__ __m128 __DEFAULT_FN_ATTRS 103 _mm_sub_ss(__m128 __a, __m128 __b) 104 { 105 __a[0] -= __b[0]; 106 return __a; 107 } 108 109 /// \brief Subtracts each of the values of the second operand from the first 110 /// operand, both of which are 128-bit vectors of [4 x float] and returns 111 /// the results of the subtraction. 112 /// 113 /// \headerfile <x86intrin.h> 114 /// 115 /// This intrinsic corresponds to the \c VSUBPS / SUBPS instructions. 116 /// 117 /// \param __a 118 /// A 128-bit vector of [4 x float] containing the minuend. 119 /// \param __b 120 /// A 128-bit vector of [4 x float] containing the subtrahend. 121 /// \returns A 128-bit vector of [4 x float] containing the differences between 122 /// both operands. 123 static __inline__ __m128 __DEFAULT_FN_ATTRS 124 _mm_sub_ps(__m128 __a, __m128 __b) 125 { 126 return (__m128)((__v4sf)__a - (__v4sf)__b); 127 } 128 129 /// \brief Multiplies two 32-bit float values in the low-order bits of the 130 /// operands. 131 /// 132 /// \headerfile <x86intrin.h> 133 /// 134 /// This intrinsic corresponds to the \c VMULSS / MULSS instructions. 135 /// 136 /// \param __a 137 /// A 128-bit vector of [4 x float] containing one of the source operands. 138 /// The lower 32 bits of this operand are used in the calculation. 139 /// \param __b 140 /// A 128-bit vector of [4 x float] containing one of the source operands. 
141 /// The lower 32 bits of this operand are used in the calculation. 142 /// \returns A 128-bit vector of [4 x float] containing the product of the lower 143 /// 32 bits of both operands. The upper 96 bits are copied from the upper 96 144 /// bits of the first source operand. 145 static __inline__ __m128 __DEFAULT_FN_ATTRS 146 _mm_mul_ss(__m128 __a, __m128 __b) 147 { 148 __a[0] *= __b[0]; 149 return __a; 150 } 151 152 /// \brief Multiplies two 128-bit vectors of [4 x float] and returns the 153 /// results of the multiplication. 154 /// 155 /// \headerfile <x86intrin.h> 156 /// 157 /// This intrinsic corresponds to the \c VMULPS / MULPS instructions. 158 /// 159 /// \param __a 160 /// A 128-bit vector of [4 x float] containing one of the source operands. 161 /// \param __b 162 /// A 128-bit vector of [4 x float] containing one of the source operands. 163 /// \returns A 128-bit vector of [4 x float] containing the products of both 164 /// operands. 165 static __inline__ __m128 __DEFAULT_FN_ATTRS 166 _mm_mul_ps(__m128 __a, __m128 __b) 167 { 168 return (__m128)((__v4sf)__a * (__v4sf)__b); 169 } 170 171 /// \brief Divides the value in the low-order 32 bits of the first operand by 172 /// the corresponding value in the second operand. 173 /// 174 /// \headerfile <x86intrin.h> 175 /// 176 /// This intrinsic corresponds to the \c VDIVSS / DIVSS instructions. 177 /// 178 /// \param __a 179 /// A 128-bit vector of [4 x float] containing the dividend. The lower 32 180 /// bits of this operand are used in the calculation. 181 /// \param __b 182 /// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits 183 /// of this operand are used in the calculation. 184 /// \returns A 128-bit vector of [4 x float] containing the quotients of the 185 /// lower 32 bits of both operands. The upper 96 bits are copied from the 186 /// upper 96 bits of the first source operand. 
187 static __inline__ __m128 __DEFAULT_FN_ATTRS 188 _mm_div_ss(__m128 __a, __m128 __b) 189 { 190 __a[0] /= __b[0]; 191 return __a; 192 } 193 194 /// \brief Divides two 128-bit vectors of [4 x float]. 195 /// 196 /// \headerfile <x86intrin.h> 197 /// 198 /// This intrinsic corresponds to the \c VDIVPS / DIVPS instructions. 199 /// 200 /// \param __a 201 /// A 128-bit vector of [4 x float] containing the dividend. 202 /// \param __b 203 /// A 128-bit vector of [4 x float] containing the divisor. 204 /// \returns A 128-bit vector of [4 x float] containing the quotients of both 205 /// operands. 206 static __inline__ __m128 __DEFAULT_FN_ATTRS 207 _mm_div_ps(__m128 __a, __m128 __b) 208 { 209 return (__m128)((__v4sf)__a / (__v4sf)__b); 210 } 211 212 /// \brief Calculates the square root of the value stored in the low-order bits 213 /// of a 128-bit vector of [4 x float]. 214 /// 215 /// \headerfile <x86intrin.h> 216 /// 217 /// This intrinsic corresponds to the \c VSQRTSS / SQRTSS instructions. 218 /// 219 /// \param __a 220 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 221 /// used in the calculation. 222 /// \returns A 128-bit vector of [4 x float] containing the square root of the 223 /// value in the low-order bits of the operand. 224 static __inline__ __m128 __DEFAULT_FN_ATTRS 225 _mm_sqrt_ss(__m128 __a) 226 { 227 __m128 __c = __builtin_ia32_sqrtss((__v4sf)__a); 228 return (__m128) { __c[0], __a[1], __a[2], __a[3] }; 229 } 230 231 /// \brief Calculates the square roots of the values stored in a 128-bit vector 232 /// of [4 x float]. 233 /// 234 /// \headerfile <x86intrin.h> 235 /// 236 /// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instructions. 237 /// 238 /// \param __a 239 /// A 128-bit vector of [4 x float]. 240 /// \returns A 128-bit vector of [4 x float] containing the square roots of the 241 /// values in the operand. 
242 static __inline__ __m128 __DEFAULT_FN_ATTRS 243 _mm_sqrt_ps(__m128 __a) 244 { 245 return __builtin_ia32_sqrtps((__v4sf)__a); 246 } 247 248 /// \brief Calculates the approximate reciprocal of the value stored in the 249 /// low-order bits of a 128-bit vector of [4 x float]. 250 /// 251 /// \headerfile <x86intrin.h> 252 /// 253 /// This intrinsic corresponds to the \c VRCPSS / RCPSS instructions. 254 /// 255 /// \param __a 256 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 257 /// used in the calculation. 258 /// \returns A 128-bit vector of [4 x float] containing the approximate 259 /// reciprocal of the value in the low-order bits of the operand. 260 static __inline__ __m128 __DEFAULT_FN_ATTRS 261 _mm_rcp_ss(__m128 __a) 262 { 263 __m128 __c = __builtin_ia32_rcpss((__v4sf)__a); 264 return (__m128) { __c[0], __a[1], __a[2], __a[3] }; 265 } 266 267 /// \brief Calculates the approximate reciprocals of the values stored in a 268 /// 128-bit vector of [4 x float]. 269 /// 270 /// \headerfile <x86intrin.h> 271 /// 272 /// This intrinsic corresponds to the \c VRCPPS / RCPPS instructions. 273 /// 274 /// \param __a 275 /// A 128-bit vector of [4 x float]. 276 /// \returns A 128-bit vector of [4 x float] containing the approximate 277 /// reciprocals of the values in the operand. 278 static __inline__ __m128 __DEFAULT_FN_ATTRS 279 _mm_rcp_ps(__m128 __a) 280 { 281 return __builtin_ia32_rcpps((__v4sf)__a); 282 } 283 284 /// \brief Calculates the approximate reciprocal of the square root of the value 285 /// stored in the low-order bits of a 128-bit vector of [4 x float]. 286 /// 287 /// \headerfile <x86intrin.h> 288 /// 289 /// This intrinsic corresponds to the \c VRSQRTSS / RSQRTSS instructions. 290 /// 291 /// \param __a 292 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 293 /// used in the calculation. 
294 /// \returns A 128-bit vector of [4 x float] containing the approximate 295 /// reciprocal of the square root of the value in the low-order bits of the 296 /// operand. 297 static __inline__ __m128 __DEFAULT_FN_ATTRS 298 _mm_rsqrt_ss(__m128 __a) 299 { 300 __m128 __c = __builtin_ia32_rsqrtss((__v4sf)__a); 301 return (__m128) { __c[0], __a[1], __a[2], __a[3] }; 302 } 303 304 /// \brief Calculates the approximate reciprocals of the square roots of the 305 /// values stored in a 128-bit vector of [4 x float]. 306 /// 307 /// \headerfile <x86intrin.h> 308 /// 309 /// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instructions. 310 /// 311 /// \param __a 312 /// A 128-bit vector of [4 x float]. 313 /// \returns A 128-bit vector of [4 x float] containing the approximate 314 /// reciprocals of the square roots of the values in the operand. 315 static __inline__ __m128 __DEFAULT_FN_ATTRS 316 _mm_rsqrt_ps(__m128 __a) 317 { 318 return __builtin_ia32_rsqrtps((__v4sf)__a); 319 } 320 321 /// \brief Compares two 32-bit float values in the low-order bits of both 322 /// operands and returns the lesser value in the low-order bits of the 323 /// vector of [4 x float]. 324 /// 325 /// \headerfile <x86intrin.h> 326 /// 327 /// This intrinsic corresponds to the \c VMINSS / MINSS instructions. 328 /// 329 /// \param __a 330 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 331 /// 32 bits of this operand are used in the comparison. 332 /// \param __b 333 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 334 /// 32 bits of this operand are used in the comparison. 335 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 336 /// minimum value between both operands. The upper 96 bits are copied from 337 /// the upper 96 bits of the first source operand. 
338 static __inline__ __m128 __DEFAULT_FN_ATTRS 339 _mm_min_ss(__m128 __a, __m128 __b) 340 { 341 return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b); 342 } 343 344 /// \brief Compares two 128-bit vectors of [4 x float] and returns the 345 /// lesser of each pair of values. 346 /// 347 /// \headerfile <x86intrin.h> 348 /// 349 /// This intrinsic corresponds to the \c VMINPS / MINPS instructions. 350 /// 351 /// \param __a 352 /// A 128-bit vector of [4 x float] containing one of the operands. 353 /// \param __b 354 /// A 128-bit vector of [4 x float] containing one of the operands. 355 /// \returns A 128-bit vector of [4 x float] containing the minimum values 356 /// between both operands. 357 static __inline__ __m128 __DEFAULT_FN_ATTRS 358 _mm_min_ps(__m128 __a, __m128 __b) 359 { 360 return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b); 361 } 362 363 /// \brief Compares two 32-bit float values in the low-order bits of both 364 /// operands and returns the greater value in the low-order bits of 365 /// a vector [4 x float]. 366 /// 367 /// \headerfile <x86intrin.h> 368 /// 369 /// This intrinsic corresponds to the \c VMAXSS / MAXSS instructions. 370 /// 371 /// \param __a 372 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 373 /// 32 bits of this operand are used in the comparison. 374 /// \param __b 375 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 376 /// 32 bits of this operand are used in the comparison. 377 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 378 /// maximum value between both operands. The upper 96 bits are copied from 379 /// the upper 96 bits of the first source operand. 380 static __inline__ __m128 __DEFAULT_FN_ATTRS 381 _mm_max_ss(__m128 __a, __m128 __b) 382 { 383 return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b); 384 } 385 386 /// \brief Compares two 128-bit vectors of [4 x float] and returns the greater 387 /// of each pair of values. 
388 /// 389 /// \headerfile <x86intrin.h> 390 /// 391 /// This intrinsic corresponds to the \c VMAXPS / MAXPS instructions. 392 /// 393 /// \param __a 394 /// A 128-bit vector of [4 x float] containing one of the operands. 395 /// \param __b 396 /// A 128-bit vector of [4 x float] containing one of the operands. 397 /// \returns A 128-bit vector of [4 x float] containing the maximum values 398 /// between both operands. 399 static __inline__ __m128 __DEFAULT_FN_ATTRS 400 _mm_max_ps(__m128 __a, __m128 __b) 401 { 402 return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b); 403 } 404 405 /// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float]. 406 /// 407 /// \headerfile <x86intrin.h> 408 /// 409 /// This intrinsic corresponds to the \c VANDPS / ANDPS instructions. 410 /// 411 /// \param __a 412 /// A 128-bit vector containing one of the source operands. 413 /// \param __b 414 /// A 128-bit vector containing one of the source operands. 415 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the 416 /// values between both operands. 417 static __inline__ __m128 __DEFAULT_FN_ATTRS 418 _mm_and_ps(__m128 __a, __m128 __b) 419 { 420 return (__m128)((__v4su)__a & (__v4su)__b); 421 } 422 423 /// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float], using 424 /// the one's complement of the values contained in the first source 425 /// operand. 426 /// 427 /// \headerfile <x86intrin.h> 428 /// 429 /// This intrinsic corresponds to the \c VANDNPS / ANDNPS instructions. 430 /// 431 /// \param __a 432 /// A 128-bit vector of [4 x float] containing the first source operand. The 433 /// one's complement of this value is used in the bitwise AND. 434 /// \param __b 435 /// A 128-bit vector of [4 x float] containing the second source operand. 436 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the 437 /// one's complement of the first operand and the values in the second 438 /// operand. 
439 static __inline__ __m128 __DEFAULT_FN_ATTRS 440 _mm_andnot_ps(__m128 __a, __m128 __b) 441 { 442 return (__m128)(~(__v4su)__a & (__v4su)__b); 443 } 444 445 /// \brief Performs a bitwise OR of two 128-bit vectors of [4 x float]. 446 /// 447 /// \headerfile <x86intrin.h> 448 /// 449 /// This intrinsic corresponds to the \c VORPS / ORPS instructions. 450 /// 451 /// \param __a 452 /// A 128-bit vector of [4 x float] containing one of the source operands. 453 /// \param __b 454 /// A 128-bit vector of [4 x float] containing one of the source operands. 455 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the 456 /// values between both operands. 457 static __inline__ __m128 __DEFAULT_FN_ATTRS 458 _mm_or_ps(__m128 __a, __m128 __b) 459 { 460 return (__m128)((__v4su)__a | (__v4su)__b); 461 } 462 463 /// \brief Performs a bitwise exclusive OR of two 128-bit vectors of 464 /// [4 x float]. 465 /// 466 /// \headerfile <x86intrin.h> 467 /// 468 /// This intrinsic corresponds to the \c VXORPS / XORPS instructions. 469 /// 470 /// \param __a 471 /// A 128-bit vector of [4 x float] containing one of the source operands. 472 /// \param __b 473 /// A 128-bit vector of [4 x float] containing one of the source operands. 474 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR 475 /// of the values between both operands. 476 static __inline__ __m128 __DEFAULT_FN_ATTRS 477 _mm_xor_ps(__m128 __a, __m128 __b) 478 { 479 return (__m128)((__v4su)__a ^ (__v4su)__b); 480 } 481 482 /// \brief Compares two 32-bit float values in the low-order bits of both 483 /// operands for equality and returns the result of the comparison in the 484 /// low-order bits of a vector [4 x float]. 485 /// 486 /// \headerfile <x86intrin.h> 487 /// 488 /// This intrinsic corresponds to the \c VCMPEQSS / CMPEQSS instructions. 489 /// 490 /// \param __a 491 /// A 128-bit vector of [4 x float] containing one of the operands. 
The lower 492 /// 32 bits of this operand are used in the comparison. 493 /// \param __b 494 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 495 /// 32 bits of this operand are used in the comparison. 496 /// \returns A 128-bit vector of [4 x float] containing the comparison results 497 /// in the low-order bits. 498 static __inline__ __m128 __DEFAULT_FN_ATTRS 499 _mm_cmpeq_ss(__m128 __a, __m128 __b) 500 { 501 return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b); 502 } 503 504 /// \brief Compares each of the corresponding 32-bit float values of the 505 /// 128-bit vectors of [4 x float] for equality. 506 /// 507 /// \headerfile <x86intrin.h> 508 /// 509 /// This intrinsic corresponds to the \c VCMPEQPS / CMPEQPS instructions. 510 /// 511 /// \param __a 512 /// A 128-bit vector of [4 x float]. 513 /// \param __b 514 /// A 128-bit vector of [4 x float]. 515 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 516 static __inline__ __m128 __DEFAULT_FN_ATTRS 517 _mm_cmpeq_ps(__m128 __a, __m128 __b) 518 { 519 return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b); 520 } 521 522 /// \brief Compares two 32-bit float values in the low-order bits of both 523 /// operands to determine if the value in the first operand is less than the 524 /// corresponding value in the second operand and returns the result of the 525 /// comparison in the low-order bits of a vector of [4 x float]. 526 /// 527 /// \headerfile <x86intrin.h> 528 /// 529 /// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions. 530 /// 531 /// \param __a 532 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 533 /// 32 bits of this operand are used in the comparison. 534 /// \param __b 535 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 536 /// 32 bits of this operand are used in the comparison. 
537 /// \returns A 128-bit vector of [4 x float] containing the comparison results 538 /// in the low-order bits. 539 static __inline__ __m128 __DEFAULT_FN_ATTRS 540 _mm_cmplt_ss(__m128 __a, __m128 __b) 541 { 542 return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b); 543 } 544 545 /// \brief Compares each of the corresponding 32-bit float values of the 546 /// 128-bit vectors of [4 x float] to determine if the values in the first 547 /// operand are less than those in the second operand. 548 /// 549 /// \headerfile <x86intrin.h> 550 /// 551 /// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions. 552 /// 553 /// \param __a 554 /// A 128-bit vector of [4 x float]. 555 /// \param __b 556 /// A 128-bit vector of [4 x float]. 557 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 558 static __inline__ __m128 __DEFAULT_FN_ATTRS 559 _mm_cmplt_ps(__m128 __a, __m128 __b) 560 { 561 return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b); 562 } 563 564 /// \brief Compares two 32-bit float values in the low-order bits of both 565 /// operands to determine if the value in the first operand is less than or 566 /// equal to the corresponding value in the second operand and returns the 567 /// result of the comparison in the low-order bits of a vector of 568 /// [4 x float]. 569 /// 570 /// \headerfile <x86intrin.h> 571 /// 572 /// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions. 573 /// 574 /// \param __a 575 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 576 /// 32 bits of this operand are used in the comparison. 577 /// \param __b 578 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 579 /// 32 bits of this operand are used in the comparison. 580 /// \returns A 128-bit vector of [4 x float] containing the comparison results 581 /// in the low-order bits. 
582 static __inline__ __m128 __DEFAULT_FN_ATTRS 583 _mm_cmple_ss(__m128 __a, __m128 __b) 584 { 585 return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b); 586 } 587 588 /// \brief Compares each of the corresponding 32-bit float values of the 589 /// 128-bit vectors of [4 x float] to determine if the values in the first 590 /// operand are less than or equal to those in the second operand. 591 /// 592 /// \headerfile <x86intrin.h> 593 /// 594 /// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions. 595 /// 596 /// \param __a 597 /// A 128-bit vector of [4 x float]. 598 /// \param __b 599 /// A 128-bit vector of [4 x float]. 600 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 601 static __inline__ __m128 __DEFAULT_FN_ATTRS 602 _mm_cmple_ps(__m128 __a, __m128 __b) 603 { 604 return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b); 605 } 606 607 /// \brief Compares two 32-bit float values in the low-order bits of both 608 /// operands to determine if the value in the first operand is greater than 609 /// the corresponding value in the second operand and returns the result of 610 /// the comparison in the low-order bits of a vector of [4 x float]. 611 /// 612 /// \headerfile <x86intrin.h> 613 /// 614 /// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions. 615 /// 616 /// \param __a 617 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 618 /// 32 bits of this operand are used in the comparison. 619 /// \param __b 620 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 621 /// 32 bits of this operand are used in the comparison. 622 /// \returns A 128-bit vector of [4 x float] containing the comparison results 623 /// in the low-order bits. 
624 static __inline__ __m128 __DEFAULT_FN_ATTRS 625 _mm_cmpgt_ss(__m128 __a, __m128 __b) 626 { 627 return (__m128)__builtin_shufflevector((__v4sf)__a, 628 (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a), 629 4, 1, 2, 3); 630 } 631 632 /// \brief Compares each of the corresponding 32-bit float values of the 633 /// 128-bit vectors of [4 x float] to determine if the values in the first 634 /// operand are greater than those in the second operand. 635 /// 636 /// \headerfile <x86intrin.h> 637 /// 638 /// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions. 639 /// 640 /// \param __a 641 /// A 128-bit vector of [4 x float]. 642 /// \param __b 643 /// A 128-bit vector of [4 x float]. 644 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 645 static __inline__ __m128 __DEFAULT_FN_ATTRS 646 _mm_cmpgt_ps(__m128 __a, __m128 __b) 647 { 648 return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a); 649 } 650 651 /// \brief Compares two 32-bit float values in the low-order bits of both 652 /// operands to determine if the value in the first operand is greater than 653 /// or equal to the corresponding value in the second operand and returns 654 /// the result of the comparison in the low-order bits of a vector of 655 /// [4 x float]. 656 /// 657 /// \headerfile <x86intrin.h> 658 /// 659 /// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions. 660 /// 661 /// \param __a 662 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 663 /// 32 bits of this operand are used in the comparison. 664 /// \param __b 665 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 666 /// 32 bits of this operand are used in the comparison. 667 /// \returns A 128-bit vector of [4 x float] containing the comparison results 668 /// in the low-order bits. 
669 static __inline__ __m128 __DEFAULT_FN_ATTRS 670 _mm_cmpge_ss(__m128 __a, __m128 __b) 671 { 672 return (__m128)__builtin_shufflevector((__v4sf)__a, 673 (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a), 674 4, 1, 2, 3); 675 } 676 677 /// \brief Compares each of the corresponding 32-bit float values of the 678 /// 128-bit vectors of [4 x float] to determine if the values in the first 679 /// operand are greater than or equal to those in the second operand. 680 /// 681 /// \headerfile <x86intrin.h> 682 /// 683 /// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions. 684 /// 685 /// \param __a 686 /// A 128-bit vector of [4 x float]. 687 /// \param __b 688 /// A 128-bit vector of [4 x float]. 689 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 690 static __inline__ __m128 __DEFAULT_FN_ATTRS 691 _mm_cmpge_ps(__m128 __a, __m128 __b) 692 { 693 return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a); 694 } 695 696 /// \brief Compares two 32-bit float values in the low-order bits of both 697 /// operands for inequality and returns the result of the comparison in the 698 /// low-order bits of a vector of [4 x float]. 699 /// 700 /// \headerfile <x86intrin.h> 701 /// 702 /// This intrinsic corresponds to the \c VCMPNEQSS / CMPNEQSS instructions. 703 /// 704 /// \param __a 705 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 706 /// 32 bits of this operand are used in the comparison. 707 /// \param __b 708 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 709 /// 32 bits of this operand are used in the comparison. 710 /// \returns A 128-bit vector of [4 x float] containing the comparison results 711 /// in the low-order bits. 
712 static __inline__ __m128 __DEFAULT_FN_ATTRS 713 _mm_cmpneq_ss(__m128 __a, __m128 __b) 714 { 715 return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b); 716 } 717 718 /// \brief Compares each of the corresponding 32-bit float values of the 719 /// 128-bit vectors of [4 x float] for inequality. 720 /// 721 /// \headerfile <x86intrin.h> 722 /// 723 /// This intrinsic corresponds to the \c VCMPNEQPS / CMPNEQPS instructions. 724 /// 725 /// \param __a 726 /// A 128-bit vector of [4 x float]. 727 /// \param __b 728 /// A 128-bit vector of [4 x float]. 729 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 730 static __inline__ __m128 __DEFAULT_FN_ATTRS 731 _mm_cmpneq_ps(__m128 __a, __m128 __b) 732 { 733 return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b); 734 } 735 736 /// \brief Compares two 32-bit float values in the low-order bits of both 737 /// operands to determine if the value in the first operand is not less than 738 /// the corresponding value in the second operand and returns the result of 739 /// the comparison in the low-order bits of a vector of [4 x float]. 740 /// 741 /// \headerfile <x86intrin.h> 742 /// 743 /// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions. 744 /// 745 /// \param __a 746 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 747 /// 32 bits of this operand are used in the comparison. 748 /// \param __b 749 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 750 /// 32 bits of this operand are used in the comparison. 751 /// \returns A 128-bit vector of [4 x float] containing the comparison results 752 /// in the low-order bits. 
753 static __inline__ __m128 __DEFAULT_FN_ATTRS 754 _mm_cmpnlt_ss(__m128 __a, __m128 __b) 755 { 756 return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b); 757 } 758 759 /// \brief Compares each of the corresponding 32-bit float values of the 760 /// 128-bit vectors of [4 x float] to determine if the values in the first 761 /// operand are not less than those in the second operand. 762 /// 763 /// \headerfile <x86intrin.h> 764 /// 765 /// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions. 766 /// 767 /// \param __a 768 /// A 128-bit vector of [4 x float]. 769 /// \param __b 770 /// A 128-bit vector of [4 x float]. 771 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 772 static __inline__ __m128 __DEFAULT_FN_ATTRS 773 _mm_cmpnlt_ps(__m128 __a, __m128 __b) 774 { 775 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b); 776 } 777 778 /// \brief Compares two 32-bit float values in the low-order bits of both 779 /// operands to determine if the value in the first operand is not less than 780 /// or equal to the corresponding value in the second operand and returns 781 /// the result of the comparison in the low-order bits of a vector of 782 /// [4 x float]. 783 /// 784 /// \headerfile <x86intrin.h> 785 /// 786 /// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions. 787 /// 788 /// \param __a 789 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 790 /// 32 bits of this operand are used in the comparison. 791 /// \param __b 792 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 793 /// 32 bits of this operand are used in the comparison. 794 /// \returns A 128-bit vector of [4 x float] containing the comparison results 795 /// in the low-order bits. 
796 static __inline__ __m128 __DEFAULT_FN_ATTRS 797 _mm_cmpnle_ss(__m128 __a, __m128 __b) 798 { 799 return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b); 800 } 801 802 /// \brief Compares each of the corresponding 32-bit float values of the 803 /// 128-bit vectors of [4 x float] to determine if the values in the first 804 /// operand are not less than or equal to those in the second operand. 805 /// 806 /// \headerfile <x86intrin.h> 807 /// 808 /// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions. 809 /// 810 /// \param __a 811 /// A 128-bit vector of [4 x float]. 812 /// \param __b 813 /// A 128-bit vector of [4 x float]. 814 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 815 static __inline__ __m128 __DEFAULT_FN_ATTRS 816 _mm_cmpnle_ps(__m128 __a, __m128 __b) 817 { 818 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b); 819 } 820 821 /// \brief Compares two 32-bit float values in the low-order bits of both 822 /// operands to determine if the value in the first operand is not greater 823 /// than the corresponding value in the second operand and returns the 824 /// result of the comparison in the low-order bits of a vector of 825 /// [4 x float]. 826 /// 827 /// \headerfile <x86intrin.h> 828 /// 829 /// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions. 830 /// 831 /// \param __a 832 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 833 /// 32 bits of this operand are used in the comparison. 834 /// \param __b 835 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 836 /// 32 bits of this operand are used in the comparison. 837 /// \returns A 128-bit vector of [4 x float] containing the comparison results 838 /// in the low-order bits. 
839 static __inline__ __m128 __DEFAULT_FN_ATTRS 840 _mm_cmpngt_ss(__m128 __a, __m128 __b) 841 { 842 return (__m128)__builtin_shufflevector((__v4sf)__a, 843 (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a), 844 4, 1, 2, 3); 845 } 846 847 /// \brief Compares each of the corresponding 32-bit float values of the 848 /// 128-bit vectors of [4 x float] to determine if the values in the first 849 /// operand are not greater than those in the second operand. 850 /// 851 /// \headerfile <x86intrin.h> 852 /// 853 /// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions. 854 /// 855 /// \param __a 856 /// A 128-bit vector of [4 x float]. 857 /// \param __b 858 /// A 128-bit vector of [4 x float]. 859 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 860 static __inline__ __m128 __DEFAULT_FN_ATTRS 861 _mm_cmpngt_ps(__m128 __a, __m128 __b) 862 { 863 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a); 864 } 865 866 /// \brief Compares two 32-bit float values in the low-order bits of both 867 /// operands to determine if the value in the first operand is not greater 868 /// than or equal to the corresponding value in the second operand and 869 /// returns the result of the comparison in the low-order bits of a vector 870 /// of [4 x float]. 871 /// 872 /// \headerfile <x86intrin.h> 873 /// 874 /// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions. 875 /// 876 /// \param __a 877 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 878 /// 32 bits of this operand are used in the comparison. 879 /// \param __b 880 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 881 /// 32 bits of this operand are used in the comparison. 882 /// \returns A 128-bit vector of [4 x float] containing the comparison results 883 /// in the low-order bits. 
884 static __inline__ __m128 __DEFAULT_FN_ATTRS 885 _mm_cmpnge_ss(__m128 __a, __m128 __b) 886 { 887 return (__m128)__builtin_shufflevector((__v4sf)__a, 888 (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a), 889 4, 1, 2, 3); 890 } 891 892 /// \brief Compares each of the corresponding 32-bit float values of the 893 /// 128-bit vectors of [4 x float] to determine if the values in the first 894 /// operand are not greater than or equal to those in the second operand. 895 /// 896 /// \headerfile <x86intrin.h> 897 /// 898 /// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions. 899 /// 900 /// \param __a 901 /// A 128-bit vector of [4 x float]. 902 /// \param __b 903 /// A 128-bit vector of [4 x float]. 904 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 905 static __inline__ __m128 __DEFAULT_FN_ATTRS 906 _mm_cmpnge_ps(__m128 __a, __m128 __b) 907 { 908 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a); 909 } 910 911 /// \brief Compares two 32-bit float values in the low-order bits of both 912 /// operands to determine if the value in the first operand is ordered with 913 /// respect to the corresponding value in the second operand and returns the 914 /// result of the comparison in the low-order bits of a vector of 915 /// [4 x float]. 916 /// 917 /// \headerfile <x86intrin.h> 918 /// 919 /// This intrinsic corresponds to the \c VCMPORDSS / CMPORDSS instructions. 920 /// 921 /// \param __a 922 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 923 /// 32 bits of this operand are used in the comparison. 924 /// \param __b 925 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 926 /// 32 bits of this operand are used in the comparison. 927 /// \returns A 128-bit vector of [4 x float] containing the comparison results 928 /// in the low-order bits. 
929 static __inline__ __m128 __DEFAULT_FN_ATTRS 930 _mm_cmpord_ss(__m128 __a, __m128 __b) 931 { 932 return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b); 933 } 934 935 /// \brief Compares each of the corresponding 32-bit float values of the 936 /// 128-bit vectors of [4 x float] to determine if the values in the first 937 /// operand are ordered with respect to those in the second operand. 938 /// 939 /// \headerfile <x86intrin.h> 940 /// 941 /// This intrinsic corresponds to the \c VCMPORDPS / CMPORDPS instructions. 942 /// 943 /// \param __a 944 /// A 128-bit vector of [4 x float]. 945 /// \param __b 946 /// A 128-bit vector of [4 x float]. 947 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 948 static __inline__ __m128 __DEFAULT_FN_ATTRS 949 _mm_cmpord_ps(__m128 __a, __m128 __b) 950 { 951 return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b); 952 } 953 954 /// \brief Compares two 32-bit float values in the low-order bits of both 955 /// operands to determine if the value in the first operand is unordered 956 /// with respect to the corresponding value in the second operand and 957 /// returns the result of the comparison in the low-order bits of a vector 958 /// of [4 x float]. 959 /// 960 /// \headerfile <x86intrin.h> 961 /// 962 /// This intrinsic corresponds to the \c VCMPUNORDSS / CMPUNORDSS instructions. 963 /// 964 /// \param __a 965 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 966 /// 32 bits of this operand are used in the comparison. 967 /// \param __b 968 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 969 /// 32 bits of this operand are used in the comparison. 970 /// \returns A 128-bit vector of [4 x float] containing the comparison results 971 /// in the low-order bits. 
972 static __inline__ __m128 __DEFAULT_FN_ATTRS 973 _mm_cmpunord_ss(__m128 __a, __m128 __b) 974 { 975 return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b); 976 } 977 978 /// \brief Compares each of the corresponding 32-bit float values of the 979 /// 128-bit vectors of [4 x float] to determine if the values in the first 980 /// operand are unordered with respect to those in the second operand. 981 /// 982 /// \headerfile <x86intrin.h> 983 /// 984 /// This intrinsic corresponds to the \c VCMPUNORDPS / CMPUNORDPS instructions. 985 /// 986 /// \param __a 987 /// A 128-bit vector of [4 x float]. 988 /// \param __b 989 /// A 128-bit vector of [4 x float]. 990 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 991 static __inline__ __m128 __DEFAULT_FN_ATTRS 992 _mm_cmpunord_ps(__m128 __a, __m128 __b) 993 { 994 return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b); 995 } 996 997 /// \brief Compares two 32-bit float values in the low-order bits of both 998 /// operands for equality and returns the result of the comparison. 999 /// 1000 /// \headerfile <x86intrin.h> 1001 /// 1002 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions. 1003 /// 1004 /// \param __a 1005 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1006 /// used in the comparison. 1007 /// \param __b 1008 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1009 /// used in the comparison. 1010 /// \returns An integer containing the comparison results. 1011 static __inline__ int __DEFAULT_FN_ATTRS 1012 _mm_comieq_ss(__m128 __a, __m128 __b) 1013 { 1014 return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b); 1015 } 1016 1017 /// \brief Compares two 32-bit float values in the low-order bits of both 1018 /// operands to determine if the first operand is less than the second 1019 /// operand and returns the result of the comparison. 
1020 /// 1021 /// \headerfile <x86intrin.h> 1022 /// 1023 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions. 1024 /// 1025 /// \param __a 1026 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1027 /// used in the comparison. 1028 /// \param __b 1029 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1030 /// used in the comparison. 1031 /// \returns An integer containing the comparison results. 1032 static __inline__ int __DEFAULT_FN_ATTRS 1033 _mm_comilt_ss(__m128 __a, __m128 __b) 1034 { 1035 return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b); 1036 } 1037 1038 /// \brief Compares two 32-bit float values in the low-order bits of both 1039 /// operands to determine if the first operand is less than or equal to the 1040 /// second operand and returns the result of the comparison. 1041 /// 1042 /// \headerfile <x86intrin.h> 1043 /// 1044 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions. 1045 /// 1046 /// \param __a 1047 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1048 /// used in the comparison. 1049 /// \param __b 1050 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1051 /// used in the comparison. 1052 /// \returns An integer containing the comparison results. 1053 static __inline__ int __DEFAULT_FN_ATTRS 1054 _mm_comile_ss(__m128 __a, __m128 __b) 1055 { 1056 return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b); 1057 } 1058 1059 /// \brief Compares two 32-bit float values in the low-order bits of both 1060 /// operands to determine if the first operand is greater than the second 1061 /// operand and returns the result of the comparison. 1062 /// 1063 /// \headerfile <x86intrin.h> 1064 /// 1065 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions. 1066 /// 1067 /// \param __a 1068 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1069 /// used in the comparison. 
1070 /// \param __b 1071 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1072 /// used in the comparison. 1073 /// \returns An integer containing the comparison results. 1074 static __inline__ int __DEFAULT_FN_ATTRS 1075 _mm_comigt_ss(__m128 __a, __m128 __b) 1076 { 1077 return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b); 1078 } 1079 1080 /// \brief Compares two 32-bit float values in the low-order bits of both 1081 /// operands to determine if the first operand is greater than or equal to 1082 /// the second operand and returns the result of the comparison. 1083 /// 1084 /// \headerfile <x86intrin.h> 1085 /// 1086 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions. 1087 /// 1088 /// \param __a 1089 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1090 /// used in the comparison. 1091 /// \param __b 1092 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1093 /// used in the comparison. 1094 /// \returns An integer containing the comparison results. 1095 static __inline__ int __DEFAULT_FN_ATTRS 1096 _mm_comige_ss(__m128 __a, __m128 __b) 1097 { 1098 return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b); 1099 } 1100 1101 /// \brief Compares two 32-bit float values in the low-order bits of both 1102 /// operands to determine if the first operand is not equal to the second 1103 /// operand and returns the result of the comparison. 1104 /// 1105 /// \headerfile <x86intrin.h> 1106 /// 1107 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions. 1108 /// 1109 /// \param __a 1110 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1111 /// used in the comparison. 1112 /// \param __b 1113 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1114 /// used in the comparison. 1115 /// \returns An integer containing the comparison results. 
1116 static __inline__ int __DEFAULT_FN_ATTRS 1117 _mm_comineq_ss(__m128 __a, __m128 __b) 1118 { 1119 return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b); 1120 } 1121 1122 /// \brief Performs an unordered comparison of two 32-bit float values using 1123 /// the low-order bits of both operands to determine equality and returns 1124 /// the result of the comparison. 1125 /// 1126 /// \headerfile <x86intrin.h> 1127 /// 1128 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions. 1129 /// 1130 /// \param __a 1131 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1132 /// used in the comparison. 1133 /// \param __b 1134 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1135 /// used in the comparison. 1136 /// \returns An integer containing the comparison results. 1137 static __inline__ int __DEFAULT_FN_ATTRS 1138 _mm_ucomieq_ss(__m128 __a, __m128 __b) 1139 { 1140 return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b); 1141 } 1142 1143 /// \brief Performs an unordered comparison of two 32-bit float values using 1144 /// the low-order bits of both operands to determine if the first operand is 1145 /// less than the second operand and returns the result of the comparison. 1146 /// 1147 /// \headerfile <x86intrin.h> 1148 /// 1149 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions. 1150 /// 1151 /// \param __a 1152 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1153 /// used in the comparison. 1154 /// \param __b 1155 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1156 /// used in the comparison. 1157 /// \returns An integer containing the comparison results. 
1158 static __inline__ int __DEFAULT_FN_ATTRS 1159 _mm_ucomilt_ss(__m128 __a, __m128 __b) 1160 { 1161 return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b); 1162 } 1163 1164 /// \brief Performs an unordered comparison of two 32-bit float values using 1165 /// the low-order bits of both operands to determine if the first operand 1166 /// is less than or equal to the second operand and returns the result of 1167 /// the comparison. 1168 /// 1169 /// \headerfile <x86intrin.h> 1170 /// 1171 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions. 1172 /// 1173 /// \param __a 1174 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1175 /// used in the comparison. 1176 /// \param __b 1177 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1178 /// used in the comparison. 1179 /// \returns An integer containing the comparison results. 1180 static __inline__ int __DEFAULT_FN_ATTRS 1181 _mm_ucomile_ss(__m128 __a, __m128 __b) 1182 { 1183 return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b); 1184 } 1185 1186 /// \brief Performs an unordered comparison of two 32-bit float values using 1187 /// the low-order bits of both operands to determine if the first operand 1188 /// is greater than the second operand and returns the result of the 1189 /// comparison. 1190 /// 1191 /// \headerfile <x86intrin.h> 1192 /// 1193 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions. 1194 /// 1195 /// \param __a 1196 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1197 /// used in the comparison. 1198 /// \param __b 1199 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1200 /// used in the comparison. 1201 /// \returns An integer containing the comparison results. 
1202 static __inline__ int __DEFAULT_FN_ATTRS 1203 _mm_ucomigt_ss(__m128 __a, __m128 __b) 1204 { 1205 return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b); 1206 } 1207 1208 /// \brief Performs an unordered comparison of two 32-bit float values using 1209 /// the low-order bits of both operands to determine if the first operand is 1210 /// greater than or equal to the second operand and returns the result of 1211 /// the comparison. 1212 /// 1213 /// \headerfile <x86intrin.h> 1214 /// 1215 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions. 1216 /// 1217 /// \param __a 1218 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1219 /// used in the comparison. 1220 /// \param __b 1221 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1222 /// used in the comparison. 1223 /// \returns An integer containing the comparison results. 1224 static __inline__ int __DEFAULT_FN_ATTRS 1225 _mm_ucomige_ss(__m128 __a, __m128 __b) 1226 { 1227 return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b); 1228 } 1229 1230 /// \brief Performs an unordered comparison of two 32-bit float values using 1231 /// the low-order bits of both operands to determine inequality and returns 1232 /// the result of the comparison. 1233 /// 1234 /// \headerfile <x86intrin.h> 1235 /// 1236 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions. 1237 /// 1238 /// \param __a 1239 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1240 /// used in the comparison. 1241 /// \param __b 1242 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1243 /// used in the comparison. 1244 /// \returns An integer containing the comparison results. 
1245 static __inline__ int __DEFAULT_FN_ATTRS 1246 _mm_ucomineq_ss(__m128 __a, __m128 __b) 1247 { 1248 return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b); 1249 } 1250 1251 /// \brief Converts a float value contained in the lower 32 bits of a vector of 1252 /// [4 x float] into a 32-bit integer. 1253 /// 1254 /// \headerfile <x86intrin.h> 1255 /// 1256 /// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions. 1257 /// 1258 /// \param __a 1259 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1260 /// used in the conversion. 1261 /// \returns A 32-bit integer containing the converted value. 1262 static __inline__ int __DEFAULT_FN_ATTRS 1263 _mm_cvtss_si32(__m128 __a) 1264 { 1265 return __builtin_ia32_cvtss2si((__v4sf)__a); 1266 } 1267 1268 /// \brief Converts a float value contained in the lower 32 bits of a vector of 1269 /// [4 x float] into a 32-bit integer. 1270 /// 1271 /// \headerfile <x86intrin.h> 1272 /// 1273 /// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions. 1274 /// 1275 /// \param __a 1276 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1277 /// used in the conversion. 1278 /// \returns A 32-bit integer containing the converted value. 1279 static __inline__ int __DEFAULT_FN_ATTRS 1280 _mm_cvt_ss2si(__m128 __a) 1281 { 1282 return _mm_cvtss_si32(__a); 1283 } 1284 1285 #ifdef __x86_64__ 1286 1287 /// \brief Converts a float value contained in the lower 32 bits of a vector of 1288 /// [4 x float] into a 64-bit integer. 1289 /// 1290 /// \headerfile <x86intrin.h> 1291 /// 1292 /// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions. 1293 /// 1294 /// \param __a 1295 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1296 /// used in the conversion. 1297 /// \returns A 64-bit integer containing the converted value. 
1298 static __inline__ long long __DEFAULT_FN_ATTRS 1299 _mm_cvtss_si64(__m128 __a) 1300 { 1301 return __builtin_ia32_cvtss2si64((__v4sf)__a); 1302 } 1303 1304 #endif 1305 1306 /// \brief Converts two low-order float values in a 128-bit vector of 1307 /// [4 x float] into a 64-bit vector of [2 x i32]. 1308 /// 1309 /// \headerfile <x86intrin.h> 1310 /// 1311 /// This intrinsic corresponds to the \c CVTPS2PI instruction. 1312 /// 1313 /// \param __a 1314 /// A 128-bit vector of [4 x float]. 1315 /// \returns A 64-bit integer vector containing the converted values. 1316 static __inline__ __m64 __DEFAULT_FN_ATTRS 1317 _mm_cvtps_pi32(__m128 __a) 1318 { 1319 return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a); 1320 } 1321 1322 /// \brief Converts two low-order float values in a 128-bit vector of 1323 /// [4 x float] into a 64-bit vector of [2 x i32]. 1324 /// 1325 /// \headerfile <x86intrin.h> 1326 /// 1327 /// This intrinsic corresponds to the \c CVTPS2PI instruction. 1328 /// 1329 /// \param __a 1330 /// A 128-bit vector of [4 x float]. 1331 /// \returns A 64-bit integer vector containing the converted values. 1332 static __inline__ __m64 __DEFAULT_FN_ATTRS 1333 _mm_cvt_ps2pi(__m128 __a) 1334 { 1335 return _mm_cvtps_pi32(__a); 1336 } 1337 1338 /// \brief Converts a float value contained in the lower 32 bits of a vector of 1339 /// [4 x float] into a 32-bit integer, truncating the result when it is 1340 /// inexact. 1341 /// 1342 /// \headerfile <x86intrin.h> 1343 /// 1344 /// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions. 1345 /// 1346 /// \param __a 1347 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1348 /// used in the conversion. 1349 /// \returns A 32-bit integer containing the converted value. 
1350 static __inline__ int __DEFAULT_FN_ATTRS 1351 _mm_cvttss_si32(__m128 __a) 1352 { 1353 return __a[0]; 1354 } 1355 1356 /// \brief Converts a float value contained in the lower 32 bits of a vector of 1357 /// [4 x float] into a 32-bit integer, truncating the result when it is 1358 /// inexact. 1359 /// 1360 /// \headerfile <x86intrin.h> 1361 /// 1362 /// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions. 1363 /// 1364 /// \param __a 1365 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1366 /// used in the conversion. 1367 /// \returns A 32-bit integer containing the converted value. 1368 static __inline__ int __DEFAULT_FN_ATTRS 1369 _mm_cvtt_ss2si(__m128 __a) 1370 { 1371 return _mm_cvttss_si32(__a); 1372 } 1373 1374 /// \brief Converts a float value contained in the lower 32 bits of a vector of 1375 /// [4 x float] into a 64-bit integer, truncating the result when it is 1376 /// inexact. 1377 /// 1378 /// \headerfile <x86intrin.h> 1379 /// 1380 /// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions. 1381 /// 1382 /// \param __a 1383 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1384 /// used in the conversion. 1385 /// \returns A 64-bit integer containing the converted value. 1386 static __inline__ long long __DEFAULT_FN_ATTRS 1387 _mm_cvttss_si64(__m128 __a) 1388 { 1389 return __a[0]; 1390 } 1391 1392 /// \brief Converts two low-order float values in a 128-bit vector of 1393 /// [4 x float] into a 64-bit vector of [2 x i32], truncating the result 1394 /// when it is inexact. 1395 /// 1396 /// \headerfile <x86intrin.h> 1397 /// 1398 /// This intrinsic corresponds to the \c CVTTPS2PI / VTTPS2PI instructions. 1399 /// 1400 /// \param __a 1401 /// A 128-bit vector of [4 x float]. 1402 /// \returns A 64-bit integer vector containing the converted values. 
1403 static __inline__ __m64 __DEFAULT_FN_ATTRS 1404 _mm_cvttps_pi32(__m128 __a) 1405 { 1406 return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a); 1407 } 1408 1409 /// \brief Converts two low-order float values in a 128-bit vector of [4 x 1410 /// float] into a 64-bit vector of [2 x i32], truncating the result when it 1411 /// is inexact. 1412 /// 1413 /// \headerfile <x86intrin.h> 1414 /// 1415 /// This intrinsic corresponds to the \c CVTTPS2PI instruction. 1416 /// 1417 /// \param __a 1418 /// A 128-bit vector of [4 x float]. 1419 /// \returns A 64-bit integer vector containing the converted values. 1420 static __inline__ __m64 __DEFAULT_FN_ATTRS 1421 _mm_cvtt_ps2pi(__m128 __a) 1422 { 1423 return _mm_cvttps_pi32(__a); 1424 } 1425 1426 /// \brief Converts a 32-bit signed integer value into a floating point value 1427 /// and writes it to the lower 32 bits of the destination. The remaining 1428 /// higher order elements of the destination vector are copied from the 1429 /// corresponding elements in the first operand. 1430 /// 1431 /// \headerfile <x86intrin.h> 1432 /// 1433 /// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction. 1434 /// 1435 /// \param __a 1436 /// A 128-bit vector of [4 x float]. 1437 /// \param __b 1438 /// A 32-bit signed integer operand containing the value to be converted. 1439 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 1440 /// converted value of the second operand. The upper 96 bits are copied from 1441 /// the upper 96 bits of the first operand. 1442 static __inline__ __m128 __DEFAULT_FN_ATTRS 1443 _mm_cvtsi32_ss(__m128 __a, int __b) 1444 { 1445 __a[0] = __b; 1446 return __a; 1447 } 1448 1449 /// \brief Converts a 32-bit signed integer value into a floating point value 1450 /// and writes it to the lower 32 bits of the destination. The remaining 1451 /// higher order elements of the destination are copied from the 1452 /// corresponding elements in the first operand. 
1453 /// 1454 /// \headerfile <x86intrin.h> 1455 /// 1456 /// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction. 1457 /// 1458 /// \param __a 1459 /// A 128-bit vector of [4 x float]. 1460 /// \param __b 1461 /// A 32-bit signed integer operand containing the value to be converted. 1462 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 1463 /// converted value of the second operand. The upper 96 bits are copied from 1464 /// the upper 96 bits of the first operand. 1465 static __inline__ __m128 __DEFAULT_FN_ATTRS 1466 _mm_cvt_si2ss(__m128 __a, int __b) 1467 { 1468 return _mm_cvtsi32_ss(__a, __b); 1469 } 1470 1471 #ifdef __x86_64__ 1472 1473 /// \brief Converts a 64-bit signed integer value into a floating point value 1474 /// and writes it to the lower 32 bits of the destination. The remaining 1475 /// higher order elements of the destination are copied from the 1476 /// corresponding elements in the first operand. 1477 /// 1478 /// \headerfile <x86intrin.h> 1479 /// 1480 /// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction. 1481 /// 1482 /// \param __a 1483 /// A 128-bit vector of [4 x float]. 1484 /// \param __b 1485 /// A 64-bit signed integer operand containing the value to be converted. 1486 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 1487 /// converted value of the second operand. The upper 96 bits are copied from 1488 /// the upper 96 bits of the first operand. 1489 static __inline__ __m128 __DEFAULT_FN_ATTRS 1490 _mm_cvtsi64_ss(__m128 __a, long long __b) 1491 { 1492 __a[0] = __b; 1493 return __a; 1494 } 1495 1496 #endif 1497 1498 /// \brief Converts two elements of a 64-bit vector of [2 x i32] into two 1499 /// floating point values and writes them to the lower 64-bits of the 1500 /// destination. The remaining higher order elements of the destination are 1501 /// copied from the corresponding elements in the first operand. 
1502 /// 1503 /// \headerfile <x86intrin.h> 1504 /// 1505 /// This intrinsic corresponds to the \c CVTPI2PS instruction. 1506 /// 1507 /// \param __a 1508 /// A 128-bit vector of [4 x float]. 1509 /// \param __b 1510 /// A 64-bit vector of [2 x i32]. The elements in this vector are converted 1511 /// and written to the corresponding low-order elements in the destination. 1512 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1513 /// converted value of the second operand. The upper 64 bits are copied from 1514 /// the upper 64 bits of the first operand. 1515 static __inline__ __m128 __DEFAULT_FN_ATTRS 1516 _mm_cvtpi32_ps(__m128 __a, __m64 __b) 1517 { 1518 return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b); 1519 } 1520 1521 /// \brief Converts two elements of a 64-bit vector of [2 x i32] into two 1522 /// floating point values and writes them to the lower 64-bits of the 1523 /// destination. The remaining higher order elements of the destination are 1524 /// copied from the corresponding elements in the first operand. 1525 /// 1526 /// \headerfile <x86intrin.h> 1527 /// 1528 /// This intrinsic corresponds to the \c CVTPI2PS instruction. 1529 /// 1530 /// \param __a 1531 /// A 128-bit vector of [4 x float]. 1532 /// \param __b 1533 /// A 64-bit vector of [2 x i32]. The elements in this vector are converted 1534 /// and written to the corresponding low-order elements in the destination. 1535 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1536 /// converted value from the second operand. The upper 64 bits are copied 1537 /// from the upper 64 bits of the first operand. 1538 static __inline__ __m128 __DEFAULT_FN_ATTRS 1539 _mm_cvt_pi2ps(__m128 __a, __m64 __b) 1540 { 1541 return _mm_cvtpi32_ps(__a, __b); 1542 } 1543 1544 /// \brief Extracts a float value contained in the lower 32 bits of a vector of 1545 /// [4 x float]. 
1546 /// 1547 /// \headerfile <x86intrin.h> 1548 /// 1549 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction. 1550 /// 1551 /// \param __a 1552 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1553 /// used in the extraction. 1554 /// \returns A 32-bit float containing the extracted value. 1555 static __inline__ float __DEFAULT_FN_ATTRS 1556 _mm_cvtss_f32(__m128 __a) 1557 { 1558 return __a[0]; 1559 } 1560 1561 /// \brief Loads two packed float values from the address __p into the 1562 /// high-order bits of a 128-bit vector of [4 x float]. The low-order bits 1563 /// are copied from the low-order bits of the first operand. 1564 /// 1565 /// \headerfile <x86intrin.h> 1566 /// 1567 /// This intrinsic corresponds to the \c VMOVHPD / MOVHPD instruction. 1568 /// 1569 /// \param __a 1570 /// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0] 1571 /// of the destination. 1572 /// \param __p 1573 /// A pointer to two packed float values. Bits [63:0] are written to bits 1574 /// [127:64] of the destination. 1575 /// \returns A 128-bit vector of [4 x float] containing the moved values. 1576 static __inline__ __m128 __DEFAULT_FN_ATTRS 1577 _mm_loadh_pi(__m128 __a, const __m64 *__p) 1578 { 1579 typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8))); 1580 struct __mm_loadh_pi_struct { 1581 __mm_loadh_pi_v2f32 __u; 1582 } __attribute__((__packed__, __may_alias__)); 1583 __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u; 1584 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); 1585 return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5); 1586 } 1587 1588 /// \brief Loads two packed float values from the address __p into the low-order 1589 /// bits of a 128-bit vector of [4 x float]. The high-order bits are copied 1590 /// from the high-order bits of the first operand. 
1591 /// 1592 /// \headerfile <x86intrin.h> 1593 /// 1594 /// This intrinsic corresponds to the \c VMOVLPD / MOVLPD instruction. 1595 /// 1596 /// \param __a 1597 /// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits 1598 /// [127:64] of the destination. 1599 /// \param __p 1600 /// A pointer to two packed float values. Bits [63:0] are written to bits 1601 /// [63:0] of the destination. 1602 /// \returns A 128-bit vector of [4 x float] containing the moved values. 1603 static __inline__ __m128 __DEFAULT_FN_ATTRS 1604 _mm_loadl_pi(__m128 __a, const __m64 *__p) 1605 { 1606 typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8))); 1607 struct __mm_loadl_pi_struct { 1608 __mm_loadl_pi_v2f32 __u; 1609 } __attribute__((__packed__, __may_alias__)); 1610 __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u; 1611 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); 1612 return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3); 1613 } 1614 1615 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower 1616 /// 32 bits of the vector are initialized with the single-precision 1617 /// floating-point value loaded from a specified memory location. The upper 1618 /// 96 bits are set to zero. 1619 /// 1620 /// \headerfile <x86intrin.h> 1621 /// 1622 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction. 1623 /// 1624 /// \param __p 1625 /// A pointer to a 32-bit memory location containing a single-precision 1626 /// floating-point value. 1627 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The 1628 /// lower 32 bits contain the value loaded from the memory location. The 1629 /// upper 96 bits are set to zero. 
1630 static __inline__ __m128 __DEFAULT_FN_ATTRS 1631 _mm_load_ss(const float *__p) 1632 { 1633 struct __mm_load_ss_struct { 1634 float __u; 1635 } __attribute__((__packed__, __may_alias__)); 1636 float __u = ((struct __mm_load_ss_struct*)__p)->__u; 1637 return (__m128){ __u, 0, 0, 0 }; 1638 } 1639 1640 /// \brief Loads a 32-bit float value and duplicates it to all four vector 1641 /// elements of a 128-bit vector of [4 x float]. 1642 /// 1643 /// \headerfile <x86intrin.h> 1644 /// 1645 /// This intrinsic corresponds to the \c VMOVSS / MOVSS + \c shuffling 1646 /// instruction. 1647 /// 1648 /// \param __p 1649 /// A pointer to a float value to be loaded and duplicated. 1650 /// \returns A 128-bit vector of [4 x float] containing the loaded 1651 /// and duplicated values. 1652 static __inline__ __m128 __DEFAULT_FN_ATTRS 1653 _mm_load1_ps(const float *__p) 1654 { 1655 struct __mm_load1_ps_struct { 1656 float __u; 1657 } __attribute__((__packed__, __may_alias__)); 1658 float __u = ((struct __mm_load1_ps_struct*)__p)->__u; 1659 return (__m128){ __u, __u, __u, __u }; 1660 } 1661 1662 #define _mm_load_ps1(p) _mm_load1_ps(p) 1663 1664 /// \brief Loads a 128-bit floating-point vector of [4 x float] from an aligned 1665 /// memory location. 1666 /// 1667 /// \headerfile <x86intrin.h> 1668 /// 1669 /// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction. 1670 /// 1671 /// \param __p 1672 /// A pointer to a 128-bit memory location. The address of the memory 1673 /// location has to be 128-bit aligned. 1674 /// \returns A 128-bit vector of [4 x float] containing the loaded valus. 1675 static __inline__ __m128 __DEFAULT_FN_ATTRS 1676 _mm_load_ps(const float *__p) 1677 { 1678 return *(__m128*)__p; 1679 } 1680 1681 /// \brief Loads a 128-bit floating-point vector of [4 x float] from an 1682 /// unaligned memory location. 1683 /// 1684 /// \headerfile <x86intrin.h> 1685 /// 1686 /// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction. 
1687 /// 1688 /// \param __p 1689 /// A pointer to a 128-bit memory location. The address of the memory 1690 /// location does not have to be aligned. 1691 /// \returns A 128-bit vector of [4 x float] containing the loaded values. 1692 static __inline__ __m128 __DEFAULT_FN_ATTRS 1693 _mm_loadu_ps(const float *__p) 1694 { 1695 struct __loadu_ps { 1696 __m128 __v; 1697 } __attribute__((__packed__, __may_alias__)); 1698 return ((struct __loadu_ps*)__p)->__v; 1699 } 1700 1701 /// \brief Loads four packed float values, in reverse order, from an aligned 1702 /// memory location to 32-bit elements in a 128-bit vector of [4 x float]. 1703 /// 1704 /// \headerfile <x86intrin.h> 1705 /// 1706 /// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling 1707 /// instruction. 1708 /// 1709 /// \param __p 1710 /// A pointer to a 128-bit memory location. The address of the memory 1711 /// location has to be 128-bit aligned. 1712 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded 1713 /// in reverse order. 1714 static __inline__ __m128 __DEFAULT_FN_ATTRS 1715 _mm_loadr_ps(const float *__p) 1716 { 1717 __m128 __a = _mm_load_ps(__p); 1718 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); 1719 } 1720 1721 /// \brief Create a 128-bit vector of [4 x float] with undefined values. 1722 /// 1723 /// \headerfile <x86intrin.h> 1724 /// 1725 /// This intrinsic has no corresponding instruction. 1726 /// 1727 /// \returns A 128-bit vector of [4 x float] containing undefined values. 1728 1729 static __inline__ __m128 __DEFAULT_FN_ATTRS 1730 _mm_undefined_ps(void) 1731 { 1732 return (__m128)__builtin_ia32_undef128(); 1733 } 1734 1735 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower 1736 /// 32 bits of the vector are initialized with the specified single-precision 1737 /// floating-point value. The upper 96 bits are set to zero. 
1738 /// 1739 /// \headerfile <x86intrin.h> 1740 /// 1741 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction. 1742 /// 1743 /// \param __w 1744 /// A single-precision floating-point value used to initialize the lower 32 1745 /// bits of the result. 1746 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The 1747 /// lower 32 bits contain the value provided in the source operand. The 1748 /// upper 96 bits are set to zero. 1749 static __inline__ __m128 __DEFAULT_FN_ATTRS 1750 _mm_set_ss(float __w) 1751 { 1752 return (__m128){ __w, 0, 0, 0 }; 1753 } 1754 1755 /// \brief Constructs a 128-bit floating-point vector of [4 x float], with each 1756 /// of the four single-precision floating-point vector elements set to the 1757 /// specified single-precision floating-point value. 1758 /// 1759 /// \headerfile <x86intrin.h> 1760 /// 1761 /// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction. 1762 /// 1763 /// \param __w 1764 /// A single-precision floating-point value used to initialize each vector 1765 /// element of the result. 1766 /// \returns An initialized 128-bit floating-point vector of [4 x float]. 1767 static __inline__ __m128 __DEFAULT_FN_ATTRS 1768 _mm_set1_ps(float __w) 1769 { 1770 return (__m128){ __w, __w, __w, __w }; 1771 } 1772 1773 /* Microsoft specific. */ 1774 /// \brief Constructs a 128-bit floating-point vector of [4 x float], with each 1775 /// of the four single-precision floating-point vector elements set to the 1776 /// specified single-precision floating-point value. 1777 /// 1778 /// \headerfile <x86intrin.h> 1779 /// 1780 /// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction. 1781 /// 1782 /// \param __w 1783 /// A single-precision floating-point value used to initialize each vector 1784 /// element of the result. 1785 /// \returns An initialized 128-bit floating-point vector of [4 x float]. 
1786 static __inline__ __m128 __DEFAULT_FN_ATTRS 1787 _mm_set_ps1(float __w) 1788 { 1789 return _mm_set1_ps(__w); 1790 } 1791 1792 /// \brief Constructs a 128-bit floating-point vector of [4 x float] 1793 /// initialized with the specified single-precision floating-point values. 1794 /// 1795 /// \headerfile <x86intrin.h> 1796 /// 1797 /// This intrinsic is a utility function and does not correspond to a specific 1798 /// instruction. 1799 /// 1800 /// \param __z 1801 /// A single-precision floating-point value used to initialize bits [127:96] 1802 /// of the result. 1803 /// \param __y 1804 /// A single-precision floating-point value used to initialize bits [95:64] 1805 /// of the result. 1806 /// \param __x 1807 /// A single-precision floating-point value used to initialize bits [63:32] 1808 /// of the result. 1809 /// \param __w 1810 /// A single-precision floating-point value used to initialize bits [31:0] 1811 /// of the result. 1812 /// \returns An initialized 128-bit floating-point vector of [4 x float]. 1813 static __inline__ __m128 __DEFAULT_FN_ATTRS 1814 _mm_set_ps(float __z, float __y, float __x, float __w) 1815 { 1816 return (__m128){ __w, __x, __y, __z }; 1817 } 1818 1819 /// \brief Constructs a 128-bit floating-point vector of [4 x float], 1820 /// initialized in reverse order with the specified 32-bit single-precision 1821 /// float-point values. 1822 /// 1823 /// \headerfile <x86intrin.h> 1824 /// 1825 /// This intrinsic is a utility function and does not correspond to a specific 1826 /// instruction. 1827 /// 1828 /// \param __z 1829 /// A single-precision floating-point value used to initialize bits [31:0] 1830 /// of the result. 1831 /// \param __y 1832 /// A single-precision floating-point value used to initialize bits [63:32] 1833 /// of the result. 1834 /// \param __x 1835 /// A single-precision floating-point value used to initialize bits [95:64] 1836 /// of the result. 
1837 /// \param __w 1838 /// A single-precision floating-point value used to initialize bits [127:96] 1839 /// of the result. 1840 /// \returns An initialized 128-bit floating-point vector of [4 x float]. 1841 static __inline__ __m128 __DEFAULT_FN_ATTRS 1842 _mm_setr_ps(float __z, float __y, float __x, float __w) 1843 { 1844 return (__m128){ __z, __y, __x, __w }; 1845 } 1846 1847 /// \brief Constructs a 128-bit floating-point vector of [4 x float] initialized 1848 /// to zero. 1849 /// 1850 /// \headerfile <x86intrin.h> 1851 /// 1852 /// This intrinsic corresponds to the \c VXORPS / XORPS instruction. 1853 /// 1854 /// \returns An initialized 128-bit floating-point vector of [4 x float] with 1855 /// all elements set to zero. 1856 static __inline__ __m128 __DEFAULT_FN_ATTRS 1857 _mm_setzero_ps(void) 1858 { 1859 return (__m128){ 0, 0, 0, 0 }; 1860 } 1861 1862 /// \brief Stores the upper 64 bits of a 128-bit vector of [4 x float] to a 1863 /// memory location. 1864 /// 1865 /// \headerfile <x86intrin.h> 1866 /// 1867 /// This intrinsic corresponds to the \c VPEXTRQ / MOVQ instruction. 1868 /// 1869 /// \param __p 1870 /// A pointer to a 64-bit memory location. 1871 /// \param __a 1872 /// A 128-bit vector of [4 x float] containing the values to be stored. 1873 static __inline__ void __DEFAULT_FN_ATTRS 1874 _mm_storeh_pi(__m64 *__p, __m128 __a) 1875 { 1876 __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a); 1877 } 1878 1879 /// \brief Stores the lower 64 bits of a 128-bit vector of [4 x float] to a 1880 /// memory location. 1881 /// 1882 /// \headerfile <x86intrin.h> 1883 /// 1884 /// This intrinsic corresponds to the \c VMOVLPS / MOVLPS instruction. 1885 /// 1886 /// \param __p 1887 /// A pointer to a memory location that will receive the float values. 1888 /// \param __a 1889 /// A 128-bit vector of [4 x float] containing the values to be stored. 
1890 static __inline__ void __DEFAULT_FN_ATTRS 1891 _mm_storel_pi(__m64 *__p, __m128 __a) 1892 { 1893 __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a); 1894 } 1895 1896 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] to a 1897 /// memory location. 1898 /// 1899 /// \headerfile <x86intrin.h> 1900 /// 1901 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction. 1902 /// 1903 /// \param __p 1904 /// A pointer to a 32-bit memory location. 1905 /// \param __a 1906 /// A 128-bit vector of [4 x float] containing the value to be stored. 1907 static __inline__ void __DEFAULT_FN_ATTRS 1908 _mm_store_ss(float *__p, __m128 __a) 1909 { 1910 struct __mm_store_ss_struct { 1911 float __u; 1912 } __attribute__((__packed__, __may_alias__)); 1913 ((struct __mm_store_ss_struct*)__p)->__u = __a[0]; 1914 } 1915 1916 /// \brief Stores float values from a 128-bit vector of [4 x float] to an 1917 /// unaligned memory location. 1918 /// 1919 /// \headerfile <x86intrin.h> 1920 /// 1921 /// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction. 1922 /// 1923 /// \param __p 1924 /// A pointer to a 128-bit memory location. The address of the memory 1925 /// location does not have to be aligned. 1926 /// \param __a 1927 /// A 128-bit vector of [4 x float] containing the values to be stored. 1928 static __inline__ void __DEFAULT_FN_ATTRS 1929 _mm_storeu_ps(float *__p, __m128 __a) 1930 { 1931 struct __storeu_ps { 1932 __m128 __v; 1933 } __attribute__((__packed__, __may_alias__)); 1934 ((struct __storeu_ps*)__p)->__v = __a; 1935 } 1936 1937 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into 1938 /// four contiguous elements in an aligned memory location. 1939 /// 1940 /// \headerfile <x86intrin.h> 1941 /// 1942 /// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling 1943 /// instruction. 1944 /// 1945 /// \param __p 1946 /// A pointer to a 128-bit memory location. 
1947 /// \param __a 1948 /// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each 1949 /// of the four contiguous elements pointed by __p. 1950 static __inline__ void __DEFAULT_FN_ATTRS 1951 _mm_store_ps(float *__p, __m128 __a) 1952 { 1953 *(__m128*)__p = __a; 1954 } 1955 1956 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into 1957 /// four contiguous elements in an aligned memory location. 1958 /// 1959 /// \headerfile <x86intrin.h> 1960 /// 1961 /// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling 1962 /// instruction. 1963 /// 1964 /// \param __p 1965 /// A pointer to a 128-bit memory location. 1966 /// \param __a 1967 /// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each 1968 /// of the four contiguous elements pointed by __p. 1969 static __inline__ void __DEFAULT_FN_ATTRS 1970 _mm_store1_ps(float *__p, __m128 __a) 1971 { 1972 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0); 1973 _mm_store_ps(__p, __a); 1974 } 1975 1976 /// \brief Stores float values from a 128-bit vector of [4 x float] to an 1977 /// aligned memory location. 1978 /// 1979 /// \headerfile <x86intrin.h> 1980 /// 1981 /// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction. 1982 /// 1983 /// \param __p 1984 /// A pointer to a 128-bit memory location. The address of the memory 1985 /// location has to be 128-bit aligned. 1986 /// \param __a 1987 /// A 128-bit vector of [4 x float] containing the values to be stored. 1988 static __inline__ void __DEFAULT_FN_ATTRS 1989 _mm_store_ps1(float *__p, __m128 __a) 1990 { 1991 return _mm_store1_ps(__p, __a); 1992 } 1993 1994 /// \brief Stores float values from a 128-bit vector of [4 x float] to an 1995 /// aligned memory location in reverse order. 1996 /// 1997 /// \headerfile <x86intrin.h> 1998 /// 1999 /// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling 2000 /// instruction. 
2001 /// 2002 /// \param __p 2003 /// A pointer to a 128-bit memory location. The address of the memory 2004 /// location has to be 128-bit aligned. 2005 /// \param __a 2006 /// A 128-bit vector of [4 x float] containing the values to be stored. 2007 static __inline__ void __DEFAULT_FN_ATTRS 2008 _mm_storer_ps(float *__p, __m128 __a) 2009 { 2010 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); 2011 _mm_store_ps(__p, __a); 2012 } 2013 2014 #define _MM_HINT_T0 3 2015 #define _MM_HINT_T1 2 2016 #define _MM_HINT_T2 1 2017 #define _MM_HINT_NTA 0 2018 2019 #ifndef _MSC_VER 2020 /* FIXME: We have to #define this because "sel" must be a constant integer, and 2021 Sema doesn't do any form of constant propagation yet. */ 2022 2023 /// \brief Loads one cache line of data from the specified address to a location 2024 /// closer to the processor. 2025 /// 2026 /// \headerfile <x86intrin.h> 2027 /// 2028 /// \code 2029 /// void _mm_prefetch(const void * a, const int sel); 2030 /// \endcode 2031 /// 2032 /// This intrinsic corresponds to the \c PREFETCHNTA instruction. 2033 /// 2034 /// \param a 2035 /// A pointer to a memory location containing a cache line of data. 2036 /// \param sel 2037 /// A predefined integer constant specifying the type of prefetch operation: 2038 /// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. 2039 /// The PREFETCHNTA instruction will be generated. 2040 /// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will 2041 /// be generated. 2042 /// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will 2043 /// be generated. 2044 /// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will 2045 /// be generated. 2046 #define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel))) 2047 #endif 2048 2049 /// \brief Stores a 64-bit integer in the specified aligned memory location. 
To 2050 /// minimize caching, the data is flagged as non-temporal (unlikely to be 2051 /// used again soon). 2052 /// 2053 /// \headerfile <x86intrin.h> 2054 /// 2055 /// This intrinsic corresponds to the \c MOVNTQ instruction. 2056 /// 2057 /// \param __p 2058 /// A pointer to an aligned memory location used to store the register value. 2059 /// \param __a 2060 /// A 64-bit integer containing the value to be stored. 2061 static __inline__ void __DEFAULT_FN_ATTRS 2062 _mm_stream_pi(__m64 *__p, __m64 __a) 2063 { 2064 __builtin_ia32_movntq(__p, __a); 2065 } 2066 2067 /// \brief Moves packed float values from a 128-bit vector of [4 x float] to a 2068 /// 128-bit aligned memory location. To minimize caching, the data is flagged 2069 /// as non-temporal (unlikely to be used again soon). 2070 /// 2071 /// \headerfile <x86intrin.h> 2072 /// 2073 /// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction. 2074 /// 2075 /// \param __p 2076 /// A pointer to a 128-bit aligned memory location that will receive the 2077 /// integer values. 2078 /// \param __a 2079 /// A 128-bit vector of [4 x float] containing the values to be moved. 2080 static __inline__ void __DEFAULT_FN_ATTRS 2081 _mm_stream_ps(float *__p, __m128 __a) 2082 { 2083 __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p); 2084 } 2085 2086 /// \brief Forces strong memory ordering (serialization) between store 2087 /// instructions preceding this instruction and store instructions following 2088 /// this instruction, ensuring the system completes all previous stores 2089 /// before executing subsequent stores. 2090 /// 2091 /// \headerfile <x86intrin.h> 2092 /// 2093 /// This intrinsic corresponds to the \c SFENCE instruction. 2094 /// 2095 static __inline__ void __DEFAULT_FN_ATTRS 2096 _mm_sfence(void) 2097 { 2098 __builtin_ia32_sfence(); 2099 } 2100 2101 /// \brief Extracts 16-bit element from a 64-bit vector of [4 x i16] and 2102 /// returns it, as specified by the immediate integer operand. 
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPEXTRW / PEXTRW instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted:
///    0: Bits [15:0] are copied to the destination.
///    1: Bits [31:16] are copied to the destination.
///    2: Bits [47:32] are copied to the destination.
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Arguments are parenthesized so that the casts apply to the whole argument
   expression, not just to its first operand. */
#define _mm_extract_pi16(a, n) __extension__ ({ \
  (int)__builtin_ia32_vec_ext_v4hi((__m64)(a), (int)(n)); })

/// \brief Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand n.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPINSRW / PINSRW instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand n.
/// \param n
///    An immediate integer operand that determines which bits are to be used
///    in the destination.
///    0: Bits [15:0] are copied to the destination.
///    1: Bits [31:16] are copied to the destination.
///    2: Bits [47:32] are copied to the destination.
///    3: Bits [63:48] are copied to the destination.
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
2144 #define _mm_insert_pi16(a, d, n) __extension__ ({ \ 2145 (__m64)__builtin_ia32_vec_set_v4hi((__m64)a, (int)d, (int)n); }) 2146 2147 /// \brief Compares each of the corresponding packed 16-bit integer values of 2148 /// the 64-bit integer vectors, and writes the greater value to the 2149 /// corresponding bits in the destination. 2150 /// 2151 /// \headerfile <x86intrin.h> 2152 /// 2153 /// This intrinsic corresponds to the \c PMAXSW instruction. 2154 /// 2155 /// \param __a 2156 /// A 64-bit integer vector containing one of the source operands. 2157 /// \param __b 2158 /// A 64-bit integer vector containing one of the source operands. 2159 /// \returns A 64-bit integer vector containing the comparison results. 2160 static __inline__ __m64 __DEFAULT_FN_ATTRS 2161 _mm_max_pi16(__m64 __a, __m64 __b) 2162 { 2163 return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); 2164 } 2165 2166 /// \brief Compares each of the corresponding packed 8-bit unsigned integer 2167 /// values of the 64-bit integer vectors, and writes the greater value to the 2168 /// corresponding bits in the destination. 2169 /// 2170 /// \headerfile <x86intrin.h> 2171 /// 2172 /// This intrinsic corresponds to the \c PMAXUB instruction. 2173 /// 2174 /// \param __a 2175 /// A 64-bit integer vector containing one of the source operands. 2176 /// \param __b 2177 /// A 64-bit integer vector containing one of the source operands. 2178 /// \returns A 64-bit integer vector containing the comparison results. 2179 static __inline__ __m64 __DEFAULT_FN_ATTRS 2180 _mm_max_pu8(__m64 __a, __m64 __b) 2181 { 2182 return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b); 2183 } 2184 2185 /// \brief Compares each of the corresponding packed 16-bit integer values of 2186 /// the 64-bit integer vectors, and writes the lesser value to the 2187 /// corresponding bits in the destination. 2188 /// 2189 /// \headerfile <x86intrin.h> 2190 /// 2191 /// This intrinsic corresponds to the \c PMINSW instruction. 
2192 /// 2193 /// \param __a 2194 /// A 64-bit integer vector containing one of the source operands. 2195 /// \param __b 2196 /// A 64-bit integer vector containing one of the source operands. 2197 /// \returns A 64-bit integer vector containing the comparison results. 2198 static __inline__ __m64 __DEFAULT_FN_ATTRS 2199 _mm_min_pi16(__m64 __a, __m64 __b) 2200 { 2201 return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); 2202 } 2203 2204 /// \brief Compares each of the corresponding packed 8-bit unsigned integer 2205 /// values of the 64-bit integer vectors, and writes the lesser value to the 2206 /// corresponding bits in the destination. 2207 /// 2208 /// \headerfile <x86intrin.h> 2209 /// 2210 /// This intrinsic corresponds to the \c PMINUB instruction. 2211 /// 2212 /// \param __a 2213 /// A 64-bit integer vector containing one of the source operands. 2214 /// \param __b 2215 /// A 64-bit integer vector containing one of the source operands. 2216 /// \returns A 64-bit integer vector containing the comparison results. 2217 static __inline__ __m64 __DEFAULT_FN_ATTRS 2218 _mm_min_pu8(__m64 __a, __m64 __b) 2219 { 2220 return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); 2221 } 2222 2223 /// \brief Takes the most significant bit from each 8-bit element in a 64-bit 2224 /// integer vector to create a 16-bit mask value. Zero-extends the value to 2225 /// 32-bit integer and writes it to the destination. 2226 /// 2227 /// \headerfile <x86intrin.h> 2228 /// 2229 /// This intrinsic corresponds to the \c PMOVMSKB instruction. 2230 /// 2231 /// \param __a 2232 /// A 64-bit integer vector containing the values with bits to be extracted. 2233 /// \returns The most significant bit from each 8-bit element in the operand, 2234 /// written to bits [15:0]. 
2235 static __inline__ int __DEFAULT_FN_ATTRS 2236 _mm_movemask_pi8(__m64 __a) 2237 { 2238 return __builtin_ia32_pmovmskb((__v8qi)__a); 2239 } 2240 2241 /// \brief Multiplies packed 16-bit unsigned integer values and writes the 2242 /// high-order 16 bits of each 32-bit product to the corresponding bits in 2243 /// the destination. 2244 /// 2245 /// \headerfile <x86intrin.h> 2246 /// 2247 /// This intrinsic corresponds to the \c PMULHUW instruction. 2248 /// 2249 /// \param __a 2250 /// A 64-bit integer vector containing one of the source operands. 2251 /// \param __b 2252 /// A 64-bit integer vector containing one of the source operands. 2253 /// \returns A 64-bit integer vector containing the products of both operands. 2254 static __inline__ __m64 __DEFAULT_FN_ATTRS 2255 _mm_mulhi_pu16(__m64 __a, __m64 __b) 2256 { 2257 return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); 2258 } 2259 2260 /// \brief Shuffles the 4 16-bit integers from a 64-bit integer vector to the 2261 /// destination, as specified by the immediate value operand. 2262 /// 2263 /// \headerfile <x86intrin.h> 2264 /// 2265 /// This intrinsic corresponds to the \c PSHUFW instruction. 2266 /// 2267 /// \code 2268 /// __m64 _mm_shuffle_pi16(__m64 a, const int n); 2269 /// \endcode 2270 /// 2271 /// \param a 2272 /// A 64-bit integer vector containing the values to be shuffled. 2273 /// \param n 2274 /// An immediate value containing an 8-bit value specifying which elements to 2275 /// copy from a. The destinations within the 64-bit destination are assigned 2276 /// values as follows: 2277 /// Bits [1:0] are used to assign values to bits [15:0] in the destination. 2278 /// Bits [3:2] are used to assign values to bits [31:16] in the destination. 2279 /// Bits [5:4] are used to assign values to bits [47:32] in the destination. 2280 /// Bits [7:6] are used to assign values to bits [63:48] in the destination. 2281 /// Bit value assignments: 2282 /// 00: assigned from bits [15:0] of a. 
2283 /// 01: assigned from bits [31:16] of a. 2284 /// 10: assigned from bits [47:32] of a. 2285 /// 11: assigned from bits [63:48] of a. 2286 /// \returns A 64-bit integer vector containing the shuffled values. 2287 #define _mm_shuffle_pi16(a, n) __extension__ ({ \ 2288 (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); }) 2289 2290 /// \brief Conditionally copies the values from each 8-bit element in the first 2291 /// 64-bit integer vector operand to the specified memory location, as 2292 /// specified by the most significant bit in the corresponding element in the 2293 /// second 64-bit integer vector operand. To minimize caching, the data is 2294 /// flagged as non-temporal (unlikely to be used again soon). 2295 /// 2296 /// \headerfile <x86intrin.h> 2297 /// 2298 /// This intrinsic corresponds to the \c MASKMOVQ instruction. 2299 /// 2300 /// \param __d 2301 /// A 64-bit integer vector containing the values with elements to be copied. 2302 /// \param __n 2303 /// A 64-bit integer vector operand. The most significant bit from each 8-bit 2304 /// element determines whether the corresponding element in operand __d is 2305 /// copied. If the most significant bit of a given element is 1, the 2306 /// corresponding element in operand __d is copied. 2307 /// \param __p 2308 /// A pointer to a 64-bit memory location that will receive the conditionally 2309 /// copied integer values. The address of the memory location does not have 2310 /// to be aligned. 2311 static __inline__ void __DEFAULT_FN_ATTRS 2312 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) 2313 { 2314 __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); 2315 } 2316 2317 /// \brief Computes the rounded averages of the packed unsigned 8-bit integer 2318 /// values and writes the averages to the corresponding bits in the 2319 /// destination. 2320 /// 2321 /// \headerfile <x86intrin.h> 2322 /// 2323 /// This intrinsic corresponds to the \c PAVGB instruction. 
2324 /// 2325 /// \param __a 2326 /// A 64-bit integer vector containing one of the source operands. 2327 /// \param __b 2328 /// A 64-bit integer vector containing one of the source operands. 2329 /// \returns A 64-bit integer vector containing the averages of both operands. 2330 static __inline__ __m64 __DEFAULT_FN_ATTRS 2331 _mm_avg_pu8(__m64 __a, __m64 __b) 2332 { 2333 return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); 2334 } 2335 2336 /// \brief Computes the rounded averages of the packed unsigned 16-bit integer 2337 /// values and writes the averages to the corresponding bits in the 2338 /// destination. 2339 /// 2340 /// \headerfile <x86intrin.h> 2341 /// 2342 /// This intrinsic corresponds to the \c PAVGW instruction. 2343 /// 2344 /// \param __a 2345 /// A 64-bit integer vector containing one of the source operands. 2346 /// \param __b 2347 /// A 64-bit integer vector containing one of the source operands. 2348 /// \returns A 64-bit integer vector containing the averages of both operands. 2349 static __inline__ __m64 __DEFAULT_FN_ATTRS 2350 _mm_avg_pu16(__m64 __a, __m64 __b) 2351 { 2352 return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); 2353 } 2354 2355 /// \brief Subtracts the corresponding 8-bit unsigned integer values of the two 2356 /// 64-bit vector operands and computes the absolute value for each of the 2357 /// difference. Then sum of the 8 absolute differences is written to the 2358 /// bits [15:0] of the destination; the remaining bits [63:16] are cleared. 2359 /// 2360 /// \headerfile <x86intrin.h> 2361 /// 2362 /// This intrinsic corresponds to the \c PSADBW instruction. 2363 /// 2364 /// \param __a 2365 /// A 64-bit integer vector containing one of the source operands. 2366 /// \param __b 2367 /// A 64-bit integer vector containing one of the source operands. 2368 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the 2369 /// sets of absolute differences between both operands. 
The upper bits are 2370 /// cleared. 2371 static __inline__ __m64 __DEFAULT_FN_ATTRS 2372 _mm_sad_pu8(__m64 __a, __m64 __b) 2373 { 2374 return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); 2375 } 2376 2377 /// \brief Returns the contents of the MXCSR register as a 32-bit unsigned 2378 /// integer value. There are several groups of macros associated with this 2379 /// intrinsic, including: 2380 /// * For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, 2381 /// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, 2382 /// _MM_EXCEPT_INEXACT. There is a convenience wrapper 2383 /// _MM_GET_EXCEPTION_STATE(). 2384 /// * For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW, 2385 /// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT. 2386 /// There is a convenience wrapper _MM_GET_EXCEPTION_MASK(). 2387 /// * For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, 2388 /// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper 2389 /// _MM_GET_ROUNDING_MODE(x) where x is one of these macros. 2390 /// * For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. 2391 /// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE(). 2392 /// * For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON, 2393 /// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper 2394 /// _MM_GET_DENORMALS_ZERO_MODE(). 2395 /// 2396 /// For example, the expression below checks if an overflow exception has 2397 /// occurred: 2398 /// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW ) 2399 /// 2400 /// The following example gets the current rounding mode: 2401 /// _MM_GET_ROUNDING_MODE() 2402 /// 2403 /// \headerfile <x86intrin.h> 2404 /// 2405 /// This intrinsic corresponds to the \c VSTMXCSR / STMXCSR instruction. 2406 /// 2407 /// \returns A 32-bit unsigned integer containing the contents of the MXCSR 2408 /// register. 
2409 static __inline__ unsigned int __DEFAULT_FN_ATTRS 2410 _mm_getcsr(void) 2411 { 2412 return __builtin_ia32_stmxcsr(); 2413 } 2414 2415 /// \brief Sets the MXCSR register with the 32-bit unsigned integer value. There 2416 /// are several groups of macros associated with this intrinsic, including: 2417 /// * For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, 2418 /// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, 2419 /// _MM_EXCEPT_INEXACT. There is a convenience wrapper 2420 /// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros. 2421 /// * For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW, 2422 /// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT. 2423 /// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one 2424 /// of these macros. 2425 /// * For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, 2426 /// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper 2427 /// _MM_SET_ROUNDING_MODE(x) where x is one of these macros. 2428 /// * For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. 2429 /// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is 2430 /// one of these macros. 2431 /// * For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON, 2432 /// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper 2433 /// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros. 2434 /// 2435 /// For example, the following expression causes subsequent floating-point 2436 /// operations to round up: 2437 /// _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP) 2438 /// 2439 /// The following example sets the DAZ and FTZ flags: 2440 /// void setFlags() { 2441 /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON) 2442 /// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON) 2443 /// } 2444 /// 2445 /// \headerfile <x86intrin.h> 2446 /// 2447 /// This intrinsic corresponds to the \c VLDMXCSR / LDMXCSR instruction. 
2448 /// 2449 /// \param __i 2450 /// A 32-bit unsigned integer value to be written to the MXCSR register. 2451 static __inline__ void __DEFAULT_FN_ATTRS 2452 _mm_setcsr(unsigned int __i) 2453 { 2454 __builtin_ia32_ldmxcsr(__i); 2455 } 2456 2457 /// \brief Selects 4 float values from the 128-bit operands of [4 x float], as 2458 /// specified by the immediate value operand. 2459 /// 2460 /// \headerfile <x86intrin.h> 2461 /// 2462 /// \code 2463 /// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask); 2464 /// \endcode 2465 /// 2466 /// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction. 2467 /// 2468 /// \param a 2469 /// A 128-bit vector of [4 x float]. 2470 /// \param b 2471 /// A 128-bit vector of [4 x float]. 2472 /// \param mask 2473 /// An immediate value containing an 8-bit value specifying which elements to 2474 /// copy from a and b. 2475 /// Bits [3:0] specify the values copied from operand a. 2476 /// Bits [7:4] specify the values copied from operand b. The destinations 2477 /// within the 128-bit destination are assigned values as follows: 2478 /// Bits [1:0] are used to assign values to bits [31:0] in the destination. 2479 /// Bits [3:2] are used to assign values to bits [63:32] in the destination. 2480 /// Bits [5:4] are used to assign values to bits [95:64] in the destination. 2481 /// Bits [7:6] are used to assign values to bits [127:96] in the destination. 2482 /// Bit value assignments: 2483 /// 00: Bits [31:0] copied from the specified operand. 2484 /// 01: Bits [63:32] copied from the specified operand. 2485 /// 10: Bits [95:64] copied from the specified operand. 2486 /// 11: Bits [127:96] copied from the specified operand. 2487 /// \returns A 128-bit vector of [4 x float] containing the shuffled values. 
2488 #define _mm_shuffle_ps(a, b, mask) __extension__ ({ \ 2489 (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \ 2490 0 + (((mask) >> 0) & 0x3), \ 2491 0 + (((mask) >> 2) & 0x3), \ 2492 4 + (((mask) >> 4) & 0x3), \ 2493 4 + (((mask) >> 6) & 0x3)); }) 2494 2495 /// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of 2496 /// [4 x float] and interleaves them into a 128-bit vector of [4 x 2497 /// float]. 2498 /// 2499 /// \headerfile <x86intrin.h> 2500 /// 2501 /// This intrinsic corresponds to the \c VUNPCKHPS / UNPCKHPS instruction. 2502 /// 2503 /// \param __a 2504 /// A 128-bit vector of [4 x float]. 2505 /// Bits [95:64] are written to bits [31:0] of the destination. 2506 /// Bits [127:96] are written to bits [95:64] of the destination. 2507 /// \param __b 2508 /// A 128-bit vector of [4 x float]. 2509 /// Bits [95:64] are written to bits [63:32] of the destination. 2510 /// Bits [127:96] are written to bits [127:96] of the destination. 2511 /// \returns A 128-bit vector of [4 x float] containing the interleaved values. 2512 static __inline__ __m128 __DEFAULT_FN_ATTRS 2513 _mm_unpackhi_ps(__m128 __a, __m128 __b) 2514 { 2515 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7); 2516 } 2517 2518 /// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of 2519 /// [4 x float] and interleaves them into a 128-bit vector of [4 x 2520 /// float]. 2521 /// 2522 /// \headerfile <x86intrin.h> 2523 /// 2524 /// This intrinsic corresponds to the \c VUNPCKLPS / UNPCKLPS instruction. 2525 /// 2526 /// \param __a 2527 /// A 128-bit vector of [4 x float]. 2528 /// Bits [31:0] are written to bits [31:0] of the destination. 2529 /// Bits [63:32] are written to bits [95:64] of the destination. 2530 /// \param __b 2531 /// A 128-bit vector of [4 x float]. 2532 /// Bits [31:0] are written to bits [63:32] of the destination. 2533 /// Bits [63:32] are written to bits [127:96] of the destination. 
2534 /// \returns A 128-bit vector of [4 x float] containing the interleaved values. 2535 static __inline__ __m128 __DEFAULT_FN_ATTRS 2536 _mm_unpacklo_ps(__m128 __a, __m128 __b) 2537 { 2538 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5); 2539 } 2540 2541 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower 2542 /// 32 bits are set to the lower 32 bits of the second parameter. The upper 2543 /// 96 bits are set to the upper 96 bits of the first parameter. 2544 /// 2545 /// \headerfile <x86intrin.h> 2546 /// 2547 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction. 2548 /// 2549 /// \param __a 2550 /// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are 2551 /// written to the upper 96 bits of the result. 2552 /// \param __b 2553 /// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are 2554 /// written to the lower 32 bits of the result. 2555 /// \returns A 128-bit floating-point vector of [4 x float]. 2556 static __inline__ __m128 __DEFAULT_FN_ATTRS 2557 _mm_move_ss(__m128 __a, __m128 __b) 2558 { 2559 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 4, 1, 2, 3); 2560 } 2561 2562 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower 2563 /// 64 bits are set to the upper 64 bits of the second parameter. The upper 2564 /// 64 bits are set to the upper 64 bits of the first parameter. 2565 /// 2566 /// \headerfile <x86intrin.h> 2567 /// 2568 /// This intrinsic corresponds to the \c VUNPCKHPD / UNPCKHPD instruction. 2569 /// 2570 /// \param __a 2571 /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are 2572 /// written to the upper 64 bits of the result. 2573 /// \param __b 2574 /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are 2575 /// written to the lower 64 bits of the result. 2576 /// \returns A 128-bit floating-point vector of [4 x float]. 
2577 static __inline__ __m128 __DEFAULT_FN_ATTRS 2578 _mm_movehl_ps(__m128 __a, __m128 __b) 2579 { 2580 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3); 2581 } 2582 2583 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower 2584 /// 64 bits are set to the lower 64 bits of the first parameter. The upper 2585 /// 64 bits are set to the lower 64 bits of the second parameter. 2586 /// 2587 /// \headerfile <x86intrin.h> 2588 /// 2589 /// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction. 2590 /// 2591 /// \param __a 2592 /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are 2593 /// written to the lower 64 bits of the result. 2594 /// \param __b 2595 /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are 2596 /// written to the upper 64 bits of the result. 2597 /// \returns A 128-bit floating-point vector of [4 x float]. 2598 static __inline__ __m128 __DEFAULT_FN_ATTRS 2599 _mm_movelh_ps(__m128 __a, __m128 __b) 2600 { 2601 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5); 2602 } 2603 2604 /// \brief Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x 2605 /// float]. 2606 /// 2607 /// \headerfile <x86intrin.h> 2608 /// 2609 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction. 2610 /// 2611 /// \param __a 2612 /// A 64-bit vector of [4 x i16]. The elements of the destination are copied 2613 /// from the corresponding elements in this operand. 2614 /// \returns A 128-bit vector of [4 x float] containing the copied and converted 2615 /// values from the operand. 
2616 static __inline__ __m128 __DEFAULT_FN_ATTRS 2617 _mm_cvtpi16_ps(__m64 __a) 2618 { 2619 __m64 __b, __c; 2620 __m128 __r; 2621 2622 __b = _mm_setzero_si64(); 2623 __b = _mm_cmpgt_pi16(__b, __a); 2624 __c = _mm_unpackhi_pi16(__a, __b); 2625 __r = _mm_setzero_ps(); 2626 __r = _mm_cvtpi32_ps(__r, __c); 2627 __r = _mm_movelh_ps(__r, __r); 2628 __c = _mm_unpacklo_pi16(__a, __b); 2629 __r = _mm_cvtpi32_ps(__r, __c); 2630 2631 return __r; 2632 } 2633 2634 /// \brief Converts a 64-bit vector of 16-bit unsigned integer values into a 2635 /// 128-bit vector of [4 x float]. 2636 /// 2637 /// \headerfile <x86intrin.h> 2638 /// 2639 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction. 2640 /// 2641 /// \param __a 2642 /// A 64-bit vector of 16-bit unsigned integer values. The elements of the 2643 /// destination are copied from the corresponding elements in this operand. 2644 /// \returns A 128-bit vector of [4 x float] containing the copied and converted 2645 /// values from the operand. 2646 static __inline__ __m128 __DEFAULT_FN_ATTRS 2647 _mm_cvtpu16_ps(__m64 __a) 2648 { 2649 __m64 __b, __c; 2650 __m128 __r; 2651 2652 __b = _mm_setzero_si64(); 2653 __c = _mm_unpackhi_pi16(__a, __b); 2654 __r = _mm_setzero_ps(); 2655 __r = _mm_cvtpi32_ps(__r, __c); 2656 __r = _mm_movelh_ps(__r, __r); 2657 __c = _mm_unpacklo_pi16(__a, __b); 2658 __r = _mm_cvtpi32_ps(__r, __c); 2659 2660 return __r; 2661 } 2662 2663 /// \brief Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] 2664 /// into a 128-bit vector of [4 x float]. 2665 /// 2666 /// \headerfile <x86intrin.h> 2667 /// 2668 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction. 2669 /// 2670 /// \param __a 2671 /// A 64-bit vector of [8 x i8]. The elements of the destination are copied 2672 /// from the corresponding lower 4 elements in this operand. 2673 /// \returns A 128-bit vector of [4 x float] containing the copied and converted 2674 /// values from the operand. 
2675 static __inline__ __m128 __DEFAULT_FN_ATTRS 2676 _mm_cvtpi8_ps(__m64 __a) 2677 { 2678 __m64 __b; 2679 2680 __b = _mm_setzero_si64(); 2681 __b = _mm_cmpgt_pi8(__b, __a); 2682 __b = _mm_unpacklo_pi8(__a, __b); 2683 2684 return _mm_cvtpi16_ps(__b); 2685 } 2686 2687 /// \brief Converts the lower four unsigned 8-bit integer values from a 64-bit 2688 /// vector of [8 x u8] into a 128-bit vector of [4 x float]. 2689 /// 2690 /// \headerfile <x86intrin.h> 2691 /// 2692 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction. 2693 /// 2694 /// \param __a 2695 /// A 64-bit vector of unsigned 8-bit integer values. The elements of the 2696 /// destination are copied from the corresponding lower 4 elements in this 2697 /// operand. 2698 /// \returns A 128-bit vector of [4 x float] containing the copied and converted 2699 /// values from the source operand. 2700 static __inline__ __m128 __DEFAULT_FN_ATTRS 2701 _mm_cvtpu8_ps(__m64 __a) 2702 { 2703 __m64 __b; 2704 2705 __b = _mm_setzero_si64(); 2706 __b = _mm_unpacklo_pi8(__a, __b); 2707 2708 return _mm_cvtpi16_ps(__b); 2709 } 2710 2711 /// \brief Converts the two 32-bit signed integer values from each 64-bit vector 2712 /// operand of [2 x i32] into a 128-bit vector of [4 x float]. 2713 /// 2714 /// \headerfile <x86intrin.h> 2715 /// 2716 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction. 2717 /// 2718 /// \param __a 2719 /// A 64-bit vector of [2 x i32]. The lower elements of the destination are 2720 /// copied from the elements in this operand. 2721 /// \param __b 2722 /// A 64-bit vector of [2 x i32]. The upper elements of the destination are 2723 /// copied from the elements in this operand. 2724 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 2725 /// copied and converted values from the first operand. The upper 64 bits 2726 /// contain the copied and converted values from the second operand. 
2727 static __inline__ __m128 __DEFAULT_FN_ATTRS 2728 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b) 2729 { 2730 __m128 __c; 2731 2732 __c = _mm_setzero_ps(); 2733 __c = _mm_cvtpi32_ps(__c, __b); 2734 __c = _mm_movelh_ps(__c, __c); 2735 2736 return _mm_cvtpi32_ps(__c, __a); 2737 } 2738 2739 /// \brief Converts each single-precision floating-point element of a 128-bit 2740 /// floating-point vector of [4 x float] into a 16-bit signed integer, and 2741 /// packs the results into a 64-bit integer vector of [4 x i16]. If the 2742 /// floating-point element is NaN or infinity, or if the floating-point 2743 /// element is greater than 0x7FFFFFFF or less than -0x8000, it is converted 2744 /// to 0x8000. Otherwise if the floating-point element is greater 2745 /// than 0x7FFF, it is converted to 0x7FFF. 2746 /// 2747 /// \headerfile <x86intrin.h> 2748 /// 2749 /// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction. 2750 /// 2751 /// \param __a 2752 /// A 128-bit floating-point vector of [4 x float]. 2753 /// \returns A 64-bit integer vector of [4 x i16] containing the converted 2754 /// values. 2755 static __inline__ __m64 __DEFAULT_FN_ATTRS 2756 _mm_cvtps_pi16(__m128 __a) 2757 { 2758 __m64 __b, __c; 2759 2760 __b = _mm_cvtps_pi32(__a); 2761 __a = _mm_movehl_ps(__a, __a); 2762 __c = _mm_cvtps_pi32(__a); 2763 2764 return _mm_packs_pi32(__b, __c); 2765 } 2766 2767 /// \brief Converts each single-precision floating-point element of a 128-bit 2768 /// floating-point vector of [4 x float] into an 8-bit signed integer, and 2769 /// packs the results into the lower 32 bits of a 64-bit integer vector of 2770 /// [8 x i8]. The upper 32 bits of the vector are set to 0. If the 2771 /// floating-point element is NaN or infinity, or if the floating-point 2772 /// element is greater than 0x7FFFFFFF or less than -0x80, it is converted 2773 /// to 0x80. Otherwise if the floating-point element is greater 2774 /// than 0x7F, it is converted to 0x7F. 
2775 /// 2776 /// \headerfile <x86intrin.h> 2777 /// 2778 /// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction. 2779 /// 2780 /// \param __a 2781 /// 128-bit floating-point vector of [4 x float]. 2782 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the 2783 /// converted values and the uppper 32 bits are set to zero. 2784 static __inline__ __m64 __DEFAULT_FN_ATTRS 2785 _mm_cvtps_pi8(__m128 __a) 2786 { 2787 __m64 __b, __c; 2788 2789 __b = _mm_cvtps_pi16(__a); 2790 __c = _mm_setzero_si64(); 2791 2792 return _mm_packs_pi16(__b, __c); 2793 } 2794 2795 /// \brief Extracts the sign bits from each single-precision floating-point 2796 /// element of a 128-bit floating-point vector of [4 x float] and returns the 2797 /// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set 2798 /// to zero. 2799 /// 2800 /// \headerfile <x86intrin.h> 2801 /// 2802 /// This intrinsic corresponds to the \c VMOVMSKPS / MOVMSKPS instruction. 2803 /// 2804 /// \param __a 2805 /// A 128-bit floating-point vector of [4 x float]. 2806 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each 2807 /// single-precision floating-point element of the parameter. Bits [31:4] are 2808 /// set to zero. 
2809 static __inline__ int __DEFAULT_FN_ATTRS 2810 _mm_movemask_ps(__m128 __a) 2811 { 2812 return __builtin_ia32_movmskps((__v4sf)__a); 2813 } 2814 2815 2816 #define _MM_ALIGN16 __attribute__((aligned(16))) 2817 2818 #define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 2819 2820 #define _MM_EXCEPT_INVALID (0x0001) 2821 #define _MM_EXCEPT_DENORM (0x0002) 2822 #define _MM_EXCEPT_DIV_ZERO (0x0004) 2823 #define _MM_EXCEPT_OVERFLOW (0x0008) 2824 #define _MM_EXCEPT_UNDERFLOW (0x0010) 2825 #define _MM_EXCEPT_INEXACT (0x0020) 2826 #define _MM_EXCEPT_MASK (0x003f) 2827 2828 #define _MM_MASK_INVALID (0x0080) 2829 #define _MM_MASK_DENORM (0x0100) 2830 #define _MM_MASK_DIV_ZERO (0x0200) 2831 #define _MM_MASK_OVERFLOW (0x0400) 2832 #define _MM_MASK_UNDERFLOW (0x0800) 2833 #define _MM_MASK_INEXACT (0x1000) 2834 #define _MM_MASK_MASK (0x1f80) 2835 2836 #define _MM_ROUND_NEAREST (0x0000) 2837 #define _MM_ROUND_DOWN (0x2000) 2838 #define _MM_ROUND_UP (0x4000) 2839 #define _MM_ROUND_TOWARD_ZERO (0x6000) 2840 #define _MM_ROUND_MASK (0x6000) 2841 2842 #define _MM_FLUSH_ZERO_MASK (0x8000) 2843 #define _MM_FLUSH_ZERO_ON (0x8000) 2844 #define _MM_FLUSH_ZERO_OFF (0x0000) 2845 2846 #define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 2847 #define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 2848 #define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 2849 #define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 2850 2851 #define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 2852 #define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 2853 #define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 2854 #define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 2855 2856 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 2857 do { \ 2858 __m128 tmp3, tmp2, tmp1, tmp0; \ 2859 tmp0 = 
_mm_unpacklo_ps((row0), (row1)); \ 2860 tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 2861 tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 2862 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 2863 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 2864 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 2865 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 2866 (row3) = _mm_movehl_ps(tmp3, tmp1); \ 2867 } while (0) 2868 2869 /* Aliases for compatibility. */ 2870 #define _m_pextrw _mm_extract_pi16 2871 #define _m_pinsrw _mm_insert_pi16 2872 #define _m_pmaxsw _mm_max_pi16 2873 #define _m_pmaxub _mm_max_pu8 2874 #define _m_pminsw _mm_min_pi16 2875 #define _m_pminub _mm_min_pu8 2876 #define _m_pmovmskb _mm_movemask_pi8 2877 #define _m_pmulhuw _mm_mulhi_pu16 2878 #define _m_pshufw _mm_shuffle_pi16 2879 #define _m_maskmovq _mm_maskmove_si64 2880 #define _m_pavgb _mm_avg_pu8 2881 #define _m_pavgw _mm_avg_pu16 2882 #define _m_psadbw _mm_sad_pu8 2883 #define _m_ _mm_ 2884 #define _m_ _mm_ 2885 2886 #undef __DEFAULT_FN_ATTRS 2887 2888 /* Ugly hack for backwards-compatibility (compatible with gcc) */ 2889 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics) 2890 #include <emmintrin.h> 2891 #endif 2892 2893 #endif /* __XMMINTRIN_H */ 2894