1 /*===---- avxintrin.h - AVX intrinsics -------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24 #ifndef __IMMINTRIN_H 25 #error "Never use <avxintrin.h> directly; include <immintrin.h> instead." 
26 #endif 27 28 #ifndef __AVXINTRIN_H 29 #define __AVXINTRIN_H 30 31 typedef double __v4df __attribute__ ((__vector_size__ (32))); 32 typedef float __v8sf __attribute__ ((__vector_size__ (32))); 33 typedef long long __v4di __attribute__ ((__vector_size__ (32))); 34 typedef int __v8si __attribute__ ((__vector_size__ (32))); 35 typedef short __v16hi __attribute__ ((__vector_size__ (32))); 36 typedef char __v32qi __attribute__ ((__vector_size__ (32))); 37 38 /* Unsigned types */ 39 typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32))); 40 typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); 41 typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); 42 typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32))); 43 44 /* We need an explicitly signed variant for char. Note that this shouldn't 45 * appear in the interface though. */ 46 typedef signed char __v32qs __attribute__((__vector_size__(32))); 47 48 typedef float __m256 __attribute__ ((__vector_size__ (32))); 49 typedef double __m256d __attribute__((__vector_size__(32))); 50 typedef long long __m256i __attribute__((__vector_size__(32))); 51 52 /* Define the default attributes for the functions in this file. */ 53 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"))) 54 55 /* Arithmetic */ 56 /// \brief Adds two 256-bit vectors of [4 x double]. 57 /// 58 /// \headerfile <x86intrin.h> 59 /// 60 /// This intrinsic corresponds to the <c> VADDPD </c> instruction. 61 /// 62 /// \param __a 63 /// A 256-bit vector of [4 x double] containing one of the source operands. 64 /// \param __b 65 /// A 256-bit vector of [4 x double] containing one of the source operands. 66 /// \returns A 256-bit vector of [4 x double] containing the sums of both 67 /// operands. 
68 static __inline __m256d __DEFAULT_FN_ATTRS 69 _mm256_add_pd(__m256d __a, __m256d __b) 70 { 71 return (__m256d)((__v4df)__a+(__v4df)__b); 72 } 73 74 /// \brief Adds two 256-bit vectors of [8 x float]. 75 /// 76 /// \headerfile <x86intrin.h> 77 /// 78 /// This intrinsic corresponds to the <c> VADDPS </c> instruction. 79 /// 80 /// \param __a 81 /// A 256-bit vector of [8 x float] containing one of the source operands. 82 /// \param __b 83 /// A 256-bit vector of [8 x float] containing one of the source operands. 84 /// \returns A 256-bit vector of [8 x float] containing the sums of both 85 /// operands. 86 static __inline __m256 __DEFAULT_FN_ATTRS 87 _mm256_add_ps(__m256 __a, __m256 __b) 88 { 89 return (__m256)((__v8sf)__a+(__v8sf)__b); 90 } 91 92 /// \brief Subtracts two 256-bit vectors of [4 x double]. 93 /// 94 /// \headerfile <x86intrin.h> 95 /// 96 /// This intrinsic corresponds to the <c> VSUBPD </c> instruction. 97 /// 98 /// \param __a 99 /// A 256-bit vector of [4 x double] containing the minuend. 100 /// \param __b 101 /// A 256-bit vector of [4 x double] containing the subtrahend. 102 /// \returns A 256-bit vector of [4 x double] containing the differences between 103 /// both operands. 104 static __inline __m256d __DEFAULT_FN_ATTRS 105 _mm256_sub_pd(__m256d __a, __m256d __b) 106 { 107 return (__m256d)((__v4df)__a-(__v4df)__b); 108 } 109 110 /// \brief Subtracts two 256-bit vectors of [8 x float]. 111 /// 112 /// \headerfile <x86intrin.h> 113 /// 114 /// This intrinsic corresponds to the <c> VSUBPS </c> instruction. 115 /// 116 /// \param __a 117 /// A 256-bit vector of [8 x float] containing the minuend. 118 /// \param __b 119 /// A 256-bit vector of [8 x float] containing the subtrahend. 120 /// \returns A 256-bit vector of [8 x float] containing the differences between 121 /// both operands. 
122 static __inline __m256 __DEFAULT_FN_ATTRS 123 _mm256_sub_ps(__m256 __a, __m256 __b) 124 { 125 return (__m256)((__v8sf)__a-(__v8sf)__b); 126 } 127 128 /// \brief Adds the even-indexed values and subtracts the odd-indexed values of 129 /// two 256-bit vectors of [4 x double]. 130 /// 131 /// \headerfile <x86intrin.h> 132 /// 133 /// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction. 134 /// 135 /// \param __a 136 /// A 256-bit vector of [4 x double] containing the left source operand. 137 /// \param __b 138 /// A 256-bit vector of [4 x double] containing the right source operand. 139 /// \returns A 256-bit vector of [4 x double] containing the alternating sums 140 /// and differences between both operands. 141 static __inline __m256d __DEFAULT_FN_ATTRS 142 _mm256_addsub_pd(__m256d __a, __m256d __b) 143 { 144 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b); 145 } 146 147 /// \brief Adds the even-indexed values and subtracts the odd-indexed values of 148 /// two 256-bit vectors of [8 x float]. 149 /// 150 /// \headerfile <x86intrin.h> 151 /// 152 /// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction. 153 /// 154 /// \param __a 155 /// A 256-bit vector of [8 x float] containing the left source operand. 156 /// \param __b 157 /// A 256-bit vector of [8 x float] containing the right source operand. 158 /// \returns A 256-bit vector of [8 x float] containing the alternating sums and 159 /// differences between both operands. 160 static __inline __m256 __DEFAULT_FN_ATTRS 161 _mm256_addsub_ps(__m256 __a, __m256 __b) 162 { 163 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b); 164 } 165 166 /// \brief Divides two 256-bit vectors of [4 x double]. 167 /// 168 /// \headerfile <x86intrin.h> 169 /// 170 /// This intrinsic corresponds to the <c> VDIVPD </c> instruction. 171 /// 172 /// \param __a 173 /// A 256-bit vector of [4 x double] containing the dividend. 
174 /// \param __b 175 /// A 256-bit vector of [4 x double] containing the divisor. 176 /// \returns A 256-bit vector of [4 x double] containing the quotients of both 177 /// operands. 178 static __inline __m256d __DEFAULT_FN_ATTRS 179 _mm256_div_pd(__m256d __a, __m256d __b) 180 { 181 return (__m256d)((__v4df)__a/(__v4df)__b); 182 } 183 184 /// \brief Divides two 256-bit vectors of [8 x float]. 185 /// 186 /// \headerfile <x86intrin.h> 187 /// 188 /// This intrinsic corresponds to the <c> VDIVPS </c> instruction. 189 /// 190 /// \param __a 191 /// A 256-bit vector of [8 x float] containing the dividend. 192 /// \param __b 193 /// A 256-bit vector of [8 x float] containing the divisor. 194 /// \returns A 256-bit vector of [8 x float] containing the quotients of both 195 /// operands. 196 static __inline __m256 __DEFAULT_FN_ATTRS 197 _mm256_div_ps(__m256 __a, __m256 __b) 198 { 199 return (__m256)((__v8sf)__a/(__v8sf)__b); 200 } 201 202 /// \brief Compares two 256-bit vectors of [4 x double] and returns the greater 203 /// of each pair of values. 204 /// 205 /// \headerfile <x86intrin.h> 206 /// 207 /// This intrinsic corresponds to the <c> VMAXPD </c> instruction. 208 /// 209 /// \param __a 210 /// A 256-bit vector of [4 x double] containing one of the operands. 211 /// \param __b 212 /// A 256-bit vector of [4 x double] containing one of the operands. 213 /// \returns A 256-bit vector of [4 x double] containing the maximum values 214 /// between both operands. 215 static __inline __m256d __DEFAULT_FN_ATTRS 216 _mm256_max_pd(__m256d __a, __m256d __b) 217 { 218 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b); 219 } 220 221 /// \brief Compares two 256-bit vectors of [8 x float] and returns the greater 222 /// of each pair of values. 223 /// 224 /// \headerfile <x86intrin.h> 225 /// 226 /// This intrinsic corresponds to the <c> VMAXPS </c> instruction. 227 /// 228 /// \param __a 229 /// A 256-bit vector of [8 x float] containing one of the operands. 
230 /// \param __b 231 /// A 256-bit vector of [8 x float] containing one of the operands. 232 /// \returns A 256-bit vector of [8 x float] containing the maximum values 233 /// between both operands. 234 static __inline __m256 __DEFAULT_FN_ATTRS 235 _mm256_max_ps(__m256 __a, __m256 __b) 236 { 237 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b); 238 } 239 240 /// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser 241 /// of each pair of values. 242 /// 243 /// \headerfile <x86intrin.h> 244 /// 245 /// This intrinsic corresponds to the <c> VMINPD </c> instruction. 246 /// 247 /// \param __a 248 /// A 256-bit vector of [4 x double] containing one of the operands. 249 /// \param __b 250 /// A 256-bit vector of [4 x double] containing one of the operands. 251 /// \returns A 256-bit vector of [4 x double] containing the minimum values 252 /// between both operands. 253 static __inline __m256d __DEFAULT_FN_ATTRS 254 _mm256_min_pd(__m256d __a, __m256d __b) 255 { 256 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b); 257 } 258 259 /// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser 260 /// of each pair of values. 261 /// 262 /// \headerfile <x86intrin.h> 263 /// 264 /// This intrinsic corresponds to the <c> VMINPS </c> instruction. 265 /// 266 /// \param __a 267 /// A 256-bit vector of [8 x float] containing one of the operands. 268 /// \param __b 269 /// A 256-bit vector of [8 x float] containing one of the operands. 270 /// \returns A 256-bit vector of [8 x float] containing the minimum values 271 /// between both operands. 272 static __inline __m256 __DEFAULT_FN_ATTRS 273 _mm256_min_ps(__m256 __a, __m256 __b) 274 { 275 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b); 276 } 277 278 /// \brief Multiplies two 256-bit vectors of [4 x double]. 279 /// 280 /// \headerfile <x86intrin.h> 281 /// 282 /// This intrinsic corresponds to the <c> VMULPD </c> instruction. 
283 /// 284 /// \param __a 285 /// A 256-bit vector of [4 x double] containing one of the operands. 286 /// \param __b 287 /// A 256-bit vector of [4 x double] containing one of the operands. 288 /// \returns A 256-bit vector of [4 x double] containing the products of both 289 /// operands. 290 static __inline __m256d __DEFAULT_FN_ATTRS 291 _mm256_mul_pd(__m256d __a, __m256d __b) 292 { 293 return (__m256d)((__v4df)__a * (__v4df)__b); 294 } 295 296 /// \brief Multiplies two 256-bit vectors of [8 x float]. 297 /// 298 /// \headerfile <x86intrin.h> 299 /// 300 /// This intrinsic corresponds to the <c> VMULPS </c> instruction. 301 /// 302 /// \param __a 303 /// A 256-bit vector of [8 x float] containing one of the operands. 304 /// \param __b 305 /// A 256-bit vector of [8 x float] containing one of the operands. 306 /// \returns A 256-bit vector of [8 x float] containing the products of both 307 /// operands. 308 static __inline __m256 __DEFAULT_FN_ATTRS 309 _mm256_mul_ps(__m256 __a, __m256 __b) 310 { 311 return (__m256)((__v8sf)__a * (__v8sf)__b); 312 } 313 314 /// \brief Calculates the square roots of the values in a 256-bit vector of 315 /// [4 x double]. 316 /// 317 /// \headerfile <x86intrin.h> 318 /// 319 /// This intrinsic corresponds to the <c> VSQRTPD </c> instruction. 320 /// 321 /// \param __a 322 /// A 256-bit vector of [4 x double]. 323 /// \returns A 256-bit vector of [4 x double] containing the square roots of the 324 /// values in the operand. 325 static __inline __m256d __DEFAULT_FN_ATTRS 326 _mm256_sqrt_pd(__m256d __a) 327 { 328 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a); 329 } 330 331 /// \brief Calculates the square roots of the values in a 256-bit vector of 332 /// [8 x float]. 333 /// 334 /// \headerfile <x86intrin.h> 335 /// 336 /// This intrinsic corresponds to the <c> VSQRTPS </c> instruction. 337 /// 338 /// \param __a 339 /// A 256-bit vector of [8 x float]. 
340 /// \returns A 256-bit vector of [8 x float] containing the square roots of the 341 /// values in the operand. 342 static __inline __m256 __DEFAULT_FN_ATTRS 343 _mm256_sqrt_ps(__m256 __a) 344 { 345 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a); 346 } 347 348 /// \brief Calculates the reciprocal square roots of the values in a 256-bit 349 /// vector of [8 x float]. 350 /// 351 /// \headerfile <x86intrin.h> 352 /// 353 /// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction. 354 /// 355 /// \param __a 356 /// A 256-bit vector of [8 x float]. 357 /// \returns A 256-bit vector of [8 x float] containing the reciprocal square 358 /// roots of the values in the operand. 359 static __inline __m256 __DEFAULT_FN_ATTRS 360 _mm256_rsqrt_ps(__m256 __a) 361 { 362 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a); 363 } 364 365 /// \brief Calculates the reciprocals of the values in a 256-bit vector of 366 /// [8 x float]. 367 /// 368 /// \headerfile <x86intrin.h> 369 /// 370 /// This intrinsic corresponds to the <c> VRCPPS </c> instruction. 371 /// 372 /// \param __a 373 /// A 256-bit vector of [8 x float]. 374 /// \returns A 256-bit vector of [8 x float] containing the reciprocals of the 375 /// values in the operand. 376 static __inline __m256 __DEFAULT_FN_ATTRS 377 _mm256_rcp_ps(__m256 __a) 378 { 379 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a); 380 } 381 382 /// \brief Rounds the values in a 256-bit vector of [4 x double] as specified 383 /// by the byte operand. The source values are rounded to integer values and 384 /// returned as 64-bit double-precision floating-point values. 385 /// 386 /// \headerfile <x86intrin.h> 387 /// 388 /// \code 389 /// __m256d _mm256_round_pd(__m256d V, const int M); 390 /// \endcode 391 /// 392 /// This intrinsic corresponds to the <c> VROUNDPD </c> instruction. 393 /// 394 /// \param V 395 /// A 256-bit vector of [4 x double]. 396 /// \param M 397 /// An integer value that specifies the rounding operation. 
\n 398 /// Bits [7:4] are reserved. \n 399 /// Bit [3] is a precision exception value: \n 400 /// 0: A normal PE exception is used. \n 401 /// 1: The PE field is not updated. \n 402 /// Bit [2] is the rounding control source: \n 403 /// 0: Use bits [1:0] of \a M. \n 404 /// 1: Use the current MXCSR setting. \n 405 /// Bits [1:0] contain the rounding control definition: \n 406 /// 00: Nearest. \n 407 /// 01: Downward (toward negative infinity). \n 408 /// 10: Upward (toward positive infinity). \n 409 /// 11: Truncated. 410 /// \returns A 256-bit vector of [4 x double] containing the rounded values. 411 #define _mm256_round_pd(V, M) __extension__ ({ \ 412 (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); }) 413 414 /// \brief Rounds the values stored in a 256-bit vector of [8 x float] as 415 /// specified by the byte operand. The source values are rounded to integer 416 /// values and returned as floating-point values. 417 /// 418 /// \headerfile <x86intrin.h> 419 /// 420 /// \code 421 /// __m256 _mm256_round_ps(__m256 V, const int M); 422 /// \endcode 423 /// 424 /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. 425 /// 426 /// \param V 427 /// A 256-bit vector of [8 x float]. 428 /// \param M 429 /// An integer value that specifies the rounding operation. \n 430 /// Bits [7:4] are reserved. \n 431 /// Bit [3] is a precision exception value: \n 432 /// 0: A normal PE exception is used. \n 433 /// 1: The PE field is not updated. \n 434 /// Bit [2] is the rounding control source: \n 435 /// 0: Use bits [1:0] of \a M. \n 436 /// 1: Use the current MXCSR setting. \n 437 /// Bits [1:0] contain the rounding control definition: \n 438 /// 00: Nearest. \n 439 /// 01: Downward (toward negative infinity). \n 440 /// 10: Upward (toward positive infinity). \n 441 /// 11: Truncated. 442 /// \returns A 256-bit vector of [8 x float] containing the rounded values. 
#define _mm256_round_ps(V, M) __extension__ ({ \
    (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); })

/// \brief Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)

/// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)

/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
496 #define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) 497 498 /// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The 499 /// source values are rounded down to integer values and returned as 500 /// floating-point values. 501 /// 502 /// \headerfile <x86intrin.h> 503 /// 504 /// \code 505 /// __m256 _mm256_floor_ps(__m256 V); 506 /// \endcode 507 /// 508 /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. 509 /// 510 /// \param V 511 /// A 256-bit vector of [8 x float]. 512 /// \returns A 256-bit vector of [8 x float] containing the rounded down values. 513 #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) 514 515 /* Logical */ 516 /// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double]. 517 /// 518 /// \headerfile <x86intrin.h> 519 /// 520 /// This intrinsic corresponds to the <c> VANDPD </c> instruction. 521 /// 522 /// \param __a 523 /// A 256-bit vector of [4 x double] containing one of the source operands. 524 /// \param __b 525 /// A 256-bit vector of [4 x double] containing one of the source operands. 526 /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the 527 /// values between both operands. 528 static __inline __m256d __DEFAULT_FN_ATTRS 529 _mm256_and_pd(__m256d __a, __m256d __b) 530 { 531 return (__m256d)((__v4du)__a & (__v4du)__b); 532 } 533 534 /// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float]. 535 /// 536 /// \headerfile <x86intrin.h> 537 /// 538 /// This intrinsic corresponds to the <c> VANDPS </c> instruction. 539 /// 540 /// \param __a 541 /// A 256-bit vector of [8 x float] containing one of the source operands. 542 /// \param __b 543 /// A 256-bit vector of [8 x float] containing one of the source operands. 544 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 545 /// values between both operands. 
546 static __inline __m256 __DEFAULT_FN_ATTRS 547 _mm256_and_ps(__m256 __a, __m256 __b) 548 { 549 return (__m256)((__v8su)__a & (__v8su)__b); 550 } 551 552 /// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using 553 /// the one's complement of the values contained in the first source operand. 554 /// 555 /// \headerfile <x86intrin.h> 556 /// 557 /// This intrinsic corresponds to the <c> VANDNPD </c> instruction. 558 /// 559 /// \param __a 560 /// A 256-bit vector of [4 x double] containing the left source operand. The 561 /// one's complement of this value is used in the bitwise AND. 562 /// \param __b 563 /// A 256-bit vector of [4 x double] containing the right source operand. 564 /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the 565 /// values of the second operand and the one's complement of the first 566 /// operand. 567 static __inline __m256d __DEFAULT_FN_ATTRS 568 _mm256_andnot_pd(__m256d __a, __m256d __b) 569 { 570 return (__m256d)(~(__v4du)__a & (__v4du)__b); 571 } 572 573 /// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using 574 /// the one's complement of the values contained in the first source operand. 575 /// 576 /// \headerfile <x86intrin.h> 577 /// 578 /// This intrinsic corresponds to the <c> VANDNPS </c> instruction. 579 /// 580 /// \param __a 581 /// A 256-bit vector of [8 x float] containing the left source operand. The 582 /// one's complement of this value is used in the bitwise AND. 583 /// \param __b 584 /// A 256-bit vector of [8 x float] containing the right source operand. 585 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 586 /// values of the second operand and the one's complement of the first 587 /// operand. 
588 static __inline __m256 __DEFAULT_FN_ATTRS 589 _mm256_andnot_ps(__m256 __a, __m256 __b) 590 { 591 return (__m256)(~(__v8su)__a & (__v8su)__b); 592 } 593 594 /// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double]. 595 /// 596 /// \headerfile <x86intrin.h> 597 /// 598 /// This intrinsic corresponds to the <c> VORPD </c> instruction. 599 /// 600 /// \param __a 601 /// A 256-bit vector of [4 x double] containing one of the source operands. 602 /// \param __b 603 /// A 256-bit vector of [4 x double] containing one of the source operands. 604 /// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the 605 /// values between both operands. 606 static __inline __m256d __DEFAULT_FN_ATTRS 607 _mm256_or_pd(__m256d __a, __m256d __b) 608 { 609 return (__m256d)((__v4du)__a | (__v4du)__b); 610 } 611 612 /// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float]. 613 /// 614 /// \headerfile <x86intrin.h> 615 /// 616 /// This intrinsic corresponds to the <c> VORPS </c> instruction. 617 /// 618 /// \param __a 619 /// A 256-bit vector of [8 x float] containing one of the source operands. 620 /// \param __b 621 /// A 256-bit vector of [8 x float] containing one of the source operands. 622 /// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the 623 /// values between both operands. 624 static __inline __m256 __DEFAULT_FN_ATTRS 625 _mm256_or_ps(__m256 __a, __m256 __b) 626 { 627 return (__m256)((__v8su)__a | (__v8su)__b); 628 } 629 630 /// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double]. 631 /// 632 /// \headerfile <x86intrin.h> 633 /// 634 /// This intrinsic corresponds to the <c> VXORPD </c> instruction. 635 /// 636 /// \param __a 637 /// A 256-bit vector of [4 x double] containing one of the source operands. 638 /// \param __b 639 /// A 256-bit vector of [4 x double] containing one of the source operands. 
640 /// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the 641 /// values between both operands. 642 static __inline __m256d __DEFAULT_FN_ATTRS 643 _mm256_xor_pd(__m256d __a, __m256d __b) 644 { 645 return (__m256d)((__v4du)__a ^ (__v4du)__b); 646 } 647 648 /// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float]. 649 /// 650 /// \headerfile <x86intrin.h> 651 /// 652 /// This intrinsic corresponds to the <c> VXORPS </c> instruction. 653 /// 654 /// \param __a 655 /// A 256-bit vector of [8 x float] containing one of the source operands. 656 /// \param __b 657 /// A 256-bit vector of [8 x float] containing one of the source operands. 658 /// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the 659 /// values between both operands. 660 static __inline __m256 __DEFAULT_FN_ATTRS 661 _mm256_xor_ps(__m256 __a, __m256 __b) 662 { 663 return (__m256)((__v8su)__a ^ (__v8su)__b); 664 } 665 666 /* Horizontal arithmetic */ 667 /// \brief Horizontally adds the adjacent pairs of values contained in two 668 /// 256-bit vectors of [4 x double]. 669 /// 670 /// \headerfile <x86intrin.h> 671 /// 672 /// This intrinsic corresponds to the <c> VHADDPD </c> instruction. 673 /// 674 /// \param __a 675 /// A 256-bit vector of [4 x double] containing one of the source operands. 676 /// The horizontal sums of the values are returned in the even-indexed 677 /// elements of a vector of [4 x double]. 678 /// \param __b 679 /// A 256-bit vector of [4 x double] containing one of the source operands. 680 /// The horizontal sums of the values are returned in the odd-indexed 681 /// elements of a vector of [4 x double]. 682 /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of 683 /// both operands. 
684 static __inline __m256d __DEFAULT_FN_ATTRS 685 _mm256_hadd_pd(__m256d __a, __m256d __b) 686 { 687 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); 688 } 689 690 /// \brief Horizontally adds the adjacent pairs of values contained in two 691 /// 256-bit vectors of [8 x float]. 692 /// 693 /// \headerfile <x86intrin.h> 694 /// 695 /// This intrinsic corresponds to the <c> VHADDPS </c> instruction. 696 /// 697 /// \param __a 698 /// A 256-bit vector of [8 x float] containing one of the source operands. 699 /// The horizontal sums of the values are returned in the elements with 700 /// index 0, 1, 4, 5 of a vector of [8 x float]. 701 /// \param __b 702 /// A 256-bit vector of [8 x float] containing one of the source operands. 703 /// The horizontal sums of the values are returned in the elements with 704 /// index 2, 3, 6, 7 of a vector of [8 x float]. 705 /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of 706 /// both operands. 707 static __inline __m256 __DEFAULT_FN_ATTRS 708 _mm256_hadd_ps(__m256 __a, __m256 __b) 709 { 710 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); 711 } 712 713 /// \brief Horizontally subtracts the adjacent pairs of values contained in two 714 /// 256-bit vectors of [4 x double]. 715 /// 716 /// \headerfile <x86intrin.h> 717 /// 718 /// This intrinsic corresponds to the <c> VHSUBPD </c> instruction. 719 /// 720 /// \param __a 721 /// A 256-bit vector of [4 x double] containing one of the source operands. 722 /// The horizontal differences between the values are returned in the 723 /// even-indexed elements of a vector of [4 x double]. 724 /// \param __b 725 /// A 256-bit vector of [4 x double] containing one of the source operands. 726 /// The horizontal differences between the values are returned in the 727 /// odd-indexed elements of a vector of [4 x double]. 728 /// \returns A 256-bit vector of [4 x double] containing the horizontal 729 /// differences of both operands. 
730 static __inline __m256d __DEFAULT_FN_ATTRS 731 _mm256_hsub_pd(__m256d __a, __m256d __b) 732 { 733 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); 734 } 735 736 /// \brief Horizontally subtracts the adjacent pairs of values contained in two 737 /// 256-bit vectors of [8 x float]. 738 /// 739 /// \headerfile <x86intrin.h> 740 /// 741 /// This intrinsic corresponds to the <c> VHSUBPS </c> instruction. 742 /// 743 /// \param __a 744 /// A 256-bit vector of [8 x float] containing one of the source operands. 745 /// The horizontal differences between the values are returned in the 746 /// elements with index 0, 1, 4, 5 of a vector of [8 x float]. 747 /// \param __b 748 /// A 256-bit vector of [8 x float] containing one of the source operands. 749 /// The horizontal differences between the values are returned in the 750 /// elements with index 2, 3, 6, 7 of a vector of [8 x float]. 751 /// \returns A 256-bit vector of [8 x float] containing the horizontal 752 /// differences of both operands. 753 static __inline __m256 __DEFAULT_FN_ATTRS 754 _mm256_hsub_ps(__m256 __a, __m256 __b) 755 { 756 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b); 757 } 758 759 /* Vector permutations */ 760 /// \brief Copies the values in a 128-bit vector of [2 x double] as specified 761 /// by the 128-bit integer vector operand. 762 /// 763 /// \headerfile <x86intrin.h> 764 /// 765 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 766 /// 767 /// \param __a 768 /// A 128-bit vector of [2 x double]. 769 /// \param __c 770 /// A 128-bit integer vector operand specifying how the values are to be 771 /// copied. \n 772 /// Bit [1]: \n 773 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 774 /// vector. \n 775 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the 776 /// returned vector. 
\n 777 /// Bit [65]: \n 778 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the 779 /// returned vector. \n 780 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the 781 /// returned vector. 782 /// \returns A 128-bit vector of [2 x double] containing the copied values. 783 static __inline __m128d __DEFAULT_FN_ATTRS 784 _mm_permutevar_pd(__m128d __a, __m128i __c) 785 { 786 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c); 787 } 788 789 /// \brief Copies the values in a 256-bit vector of [4 x double] as specified 790 /// by the 256-bit integer vector operand. 791 /// 792 /// \headerfile <x86intrin.h> 793 /// 794 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 795 /// 796 /// \param __a 797 /// A 256-bit vector of [4 x double]. 798 /// \param __c 799 /// A 256-bit integer vector operand specifying how the values are to be 800 /// copied. \n 801 /// Bit [1]: \n 802 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 803 /// vector. \n 804 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the 805 /// returned vector. \n 806 /// Bit [65]: \n 807 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the 808 /// returned vector. \n 809 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the 810 /// returned vector. \n 811 /// Bit [129]: \n 812 /// 0: Bits [191:128] of the source are copied to bits [191:128] of the 813 /// returned vector. \n 814 /// 1: Bits [255:192] of the source are copied to bits [191:128] of the 815 /// returned vector. \n 816 /// Bit [193]: \n 817 /// 0: Bits [191:128] of the source are copied to bits [255:192] of the 818 /// returned vector. \n 819 /// 1: Bits [255:192] of the source are copied to bits [255:192] of the 820 /// returned vector. 821 /// \returns A 256-bit vector of [4 x double] containing the copied values. 
822 static __inline __m256d __DEFAULT_FN_ATTRS 823 _mm256_permutevar_pd(__m256d __a, __m256i __c) 824 { 825 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c); 826 } 827 828 /// \brief Copies the values stored in a 128-bit vector of [4 x float] as 829 /// specified by the 128-bit integer vector operand. 830 /// \headerfile <x86intrin.h> 831 /// 832 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 833 /// 834 /// \param __a 835 /// A 128-bit vector of [4 x float]. 836 /// \param __c 837 /// A 128-bit integer vector operand specifying how the values are to be 838 /// copied. \n 839 /// Bits [1:0]: \n 840 /// 00: Bits [31:0] of the source are copied to bits [31:0] of the 841 /// returned vector. \n 842 /// 01: Bits [63:32] of the source are copied to bits [31:0] of the 843 /// returned vector. \n 844 /// 10: Bits [95:64] of the source are copied to bits [31:0] of the 845 /// returned vector. \n 846 /// 11: Bits [127:96] of the source are copied to bits [31:0] of the 847 /// returned vector. \n 848 /// Bits [33:32]: \n 849 /// 00: Bits [31:0] of the source are copied to bits [63:32] of the 850 /// returned vector. \n 851 /// 01: Bits [63:32] of the source are copied to bits [63:32] of the 852 /// returned vector. \n 853 /// 10: Bits [95:64] of the source are copied to bits [63:32] of the 854 /// returned vector. \n 855 /// 11: Bits [127:96] of the source are copied to bits [63:32] of the 856 /// returned vector. \n 857 /// Bits [65:64]: \n 858 /// 00: Bits [31:0] of the source are copied to bits [95:64] of the 859 /// returned vector. \n 860 /// 01: Bits [63:32] of the source are copied to bits [95:64] of the 861 /// returned vector. \n 862 /// 10: Bits [95:64] of the source are copied to bits [95:64] of the 863 /// returned vector. \n 864 /// 11: Bits [127:96] of the source are copied to bits [95:64] of the 865 /// returned vector. 
\n 866 /// Bits [97:96]: \n 867 /// 00: Bits [31:0] of the source are copied to bits [127:96] of the 868 /// returned vector. \n 869 /// 01: Bits [63:32] of the source are copied to bits [127:96] of the 870 /// returned vector. \n 871 /// 10: Bits [95:64] of the source are copied to bits [127:96] of the 872 /// returned vector. \n 873 /// 11: Bits [127:96] of the source are copied to bits [127:96] of the 874 /// returned vector. 875 /// \returns A 128-bit vector of [4 x float] containing the copied values. 876 static __inline __m128 __DEFAULT_FN_ATTRS 877 _mm_permutevar_ps(__m128 __a, __m128i __c) 878 { 879 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c); 880 } 881 882 /// \brief Copies the values stored in a 256-bit vector of [8 x float] as 883 /// specified by the 256-bit integer vector operand. 884 /// 885 /// \headerfile <x86intrin.h> 886 /// 887 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 888 /// 889 /// \param __a 890 /// A 256-bit vector of [8 x float]. 891 /// \param __c 892 /// A 256-bit integer vector operand specifying how the values are to be 893 /// copied. \n 894 /// Bits [1:0]: \n 895 /// 00: Bits [31:0] of the source are copied to bits [31:0] of the 896 /// returned vector. \n 897 /// 01: Bits [63:32] of the source are copied to bits [31:0] of the 898 /// returned vector. \n 899 /// 10: Bits [95:64] of the source are copied to bits [31:0] of the 900 /// returned vector. \n 901 /// 11: Bits [127:96] of the source are copied to bits [31:0] of the 902 /// returned vector. \n 903 /// Bits [33:32]: \n 904 /// 00: Bits [31:0] of the source are copied to bits [63:32] of the 905 /// returned vector. \n 906 /// 01: Bits [63:32] of the source are copied to bits [63:32] of the 907 /// returned vector. \n 908 /// 10: Bits [95:64] of the source are copied to bits [63:32] of the 909 /// returned vector. \n 910 /// 11: Bits [127:96] of the source are copied to bits [63:32] of the 911 /// returned vector. 
\n 912 /// Bits [65:64]: \n 913 /// 00: Bits [31:0] of the source are copied to bits [95:64] of the 914 /// returned vector. \n 915 /// 01: Bits [63:32] of the source are copied to bits [95:64] of the 916 /// returned vector. \n 917 /// 10: Bits [95:64] of the source are copied to bits [95:64] of the 918 /// returned vector. \n 919 /// 11: Bits [127:96] of the source are copied to bits [95:64] of the 920 /// returned vector. \n 921 /// Bits [97:96]: \n 922 /// 00: Bits [31:0] of the source are copied to bits [127:96] of the 923 /// returned vector. \n 924 /// 01: Bits [63:32] of the source are copied to bits [127:96] of the 925 /// returned vector. \n 926 /// 10: Bits [95:64] of the source are copied to bits [127:96] of the 927 /// returned vector. \n 928 /// 11: Bits [127:96] of the source are copied to bits [127:96] of the 929 /// returned vector. \n 930 /// Bits [129:128]: \n 931 /// 00: Bits [159:128] of the source are copied to bits [159:128] of the 932 /// returned vector. \n 933 /// 01: Bits [191:160] of the source are copied to bits [159:128] of the 934 /// returned vector. \n 935 /// 10: Bits [223:192] of the source are copied to bits [159:128] of the 936 /// returned vector. \n 937 /// 11: Bits [255:224] of the source are copied to bits [159:128] of the 938 /// returned vector. \n 939 /// Bits [161:160]: \n 940 /// 00: Bits [159:128] of the source are copied to bits [191:160] of the 941 /// returned vector. \n 942 /// 01: Bits [191:160] of the source are copied to bits [191:160] of the 943 /// returned vector. \n 944 /// 10: Bits [223:192] of the source are copied to bits [191:160] of the 945 /// returned vector. \n 946 /// 11: Bits [255:224] of the source are copied to bits [191:160] of the 947 /// returned vector. \n 948 /// Bits [193:192]: \n 949 /// 00: Bits [159:128] of the source are copied to bits [223:192] of the 950 /// returned vector. \n 951 /// 01: Bits [191:160] of the source are copied to bits [223:192] of the 952 /// returned vector. 
\n 953 /// 10: Bits [223:192] of the source are copied to bits [223:192] of the 954 /// returned vector. \n 955 /// 11: Bits [255:224] of the source are copied to bits [223:192] of the 956 /// returned vector. \n 957 /// Bits [225:224]: \n 958 /// 00: Bits [159:128] of the source are copied to bits [255:224] of the 959 /// returned vector. \n 960 /// 01: Bits [191:160] of the source are copied to bits [255:224] of the 961 /// returned vector. \n 962 /// 10: Bits [223:192] of the source are copied to bits [255:224] of the 963 /// returned vector. \n 964 /// 11: Bits [255:224] of the source are copied to bits [255:224] of the 965 /// returned vector. 966 /// \returns A 256-bit vector of [8 x float] containing the copied values. 967 static __inline __m256 __DEFAULT_FN_ATTRS 968 _mm256_permutevar_ps(__m256 __a, __m256i __c) 969 { 970 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c); 971 } 972 973 /// \brief Copies the values in a 128-bit vector of [2 x double] as specified 974 /// by the immediate integer operand. 975 /// 976 /// \headerfile <x86intrin.h> 977 /// 978 /// \code 979 /// __m128d _mm_permute_pd(__m128d A, const int C); 980 /// \endcode 981 /// 982 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 983 /// 984 /// \param A 985 /// A 128-bit vector of [2 x double]. 986 /// \param C 987 /// An immediate integer operand specifying how the values are to be 988 /// copied. \n 989 /// Bit [0]: \n 990 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 991 /// vector. \n 992 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the 993 /// returned vector. \n 994 /// Bit [1]: \n 995 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the 996 /// returned vector. \n 997 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the 998 /// returned vector. 999 /// \returns A 128-bit vector of [2 x double] containing the copied values. 
/* Both shuffle indices evaluate to 0 or 1, so only elements of A are ever
   selected; the undefined second operand is just a placeholder. */
#define _mm_permute_pd(A, C) __extension__ ({ \
  (__m128d)__builtin_shufflevector( \
      (__v2df)(__m128d)(A), (__v2df)_mm_undefined_pd(), \
      (((C) & 0x1) ? 1 : 0), \
      (((C) & 0x2) ? 1 : 0)); })

/// \brief Copies the values in a 256-bit vector of [4 x double] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute_pd(__m256d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 256-bit vector of [4 x double].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bit [0]: \n
///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
///         vector. \n
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector. \n
///    Bit [1]: \n
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector. \n
///    Bit [2]: \n
///      0: Bits [191:128] of the source are copied to bits [191:128] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [191:128] of the
///         returned vector. \n
///    Bit [3]: \n
///      0: Bits [191:128] of the source are copied to bits [255:192] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [255:192] of the
///         returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Indices 0-1 pick from the low 128-bit half of A and indices 2-3 from the
   high half; no index reaches 4, so the undefined second operand is never
   selected. */
#define _mm256_permute_pd(A, C) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
      (__v4df)(__m256d)(A), (__v4df)_mm256_undefined_pd(), \
      (((C) & 0x1) ? 1 : 0), \
      (((C) & 0x2) ? 1 : 0), \
      (((C) & 0x4) ? 3 : 2), \
      (((C) & 0x8) ? 3 : 2)); })

/// \brief Copies the values in a 128-bit vector of [4 x float] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_permute_ps(__m128 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 128-bit vector of [4 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector.
///          \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
/* Every shuffle index is a 2-bit field of C (range 0-3), so only elements of
   A are ever selected; the undefined second operand is just a placeholder. */
#define _mm_permute_ps(A, C) __extension__ ({ \
  (__m128)__builtin_shufflevector( \
      (__v4sf)(__m128)(A), (__v4sf)_mm_undefined_ps(), \
      ((C) & 0x03), \
      (((C) & 0x0c) >> 2), \
      (((C) & 0x30) >> 4), \
      (((C) & 0xc0) >> 6)); })

/// \brief Copies the values in a 256-bit vector of [8 x float] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute_ps(__m256 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 256-bit vector of [8 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector.
///          \n
///    Bits [3:2]: \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector. \n
///    Bits [1:0]: \n
///      00: Bits [159:128] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [159:128] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [159:128] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [191:160] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [159:128] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [223:192] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [159:128] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [255:224] of the
///          returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* The same four 2-bit fields of C drive both 128-bit halves: indices 0-3
   select within the low half of A and indices 4-7 within the high half.
   No index reaches 8, so the undefined second operand is never selected. */
#define _mm256_permute_ps(A, C) __extension__ ({ \
  (__m256)__builtin_shufflevector( \
      (__v8sf)(__m256)(A), (__v8sf)_mm256_undefined_ps(), \
      ((C) & 0x03), \
      (((C) & 0x0c) >> 2), \
      (((C) & 0x30) >> 4), \
      (((C) & 0xc0) >> 6), \
      (4 + ((C) & 0x03)), \
      (4 + (((C) & 0x0c) >> 2)), \
      (4 + (((C) & 0x30) >> 4)), \
      (4 + (((C) & 0xc0) >> 6))); })

/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
///    [4 x double], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Written as a macro so that M, documented above as an immediate operand, is
   forwarded to the builtin unchanged. V1 and V2 are first converted to the
   public __m256d type and then to the element-typed __v4df vector that is
   passed to the builtin. */
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
                                           (__v4df)(__m256d)(V2), (M)); })

/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
///    [8 x float], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Written as a macro so that M, documented above as an immediate operand, is
   forwarded to the builtin unchanged. V1 and V2 are first converted to the
   public __m256 type and then to the element-typed __v8sf vector that is
   passed to the builtin. */
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                          (__v8sf)(__m256)(V2), (M)); })

/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
///    as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector.
/// \param V2
///    A 256-bit integer vector.
/// \param M
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit integer vector containing the copied values.
/* Written as a macro so that M, documented above as an immediate operand, is
   forwarded to the builtin unchanged. V1 and V2 are first converted to the
   public __m256i type and then to the element-typed __v8si vector that is
   passed to the builtin. */
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                           (__v8si)(__m256i)(V2), (M)); })

/* Vector Blend */
/// \brief Merges 64-bit double-precision data values stored in either of the
///    two 256-bit vectors of [4 x double], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 64-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Shuffle indices 0-3 name the elements of V1 and 4-7 the same positions in
   V2, so a set mask bit adds 4 to the element's own index. */
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
      (__v4df)(__m256d)(V1), (__v4df)(__m256d)(V2), \
      (0 + 4 * (((M) >> 0) & 0x1)), \
      (1 + 4 * (((M) >> 1) & 0x1)), \
      (2 + 4 * (((M) >> 2) & 0x1)), \
      (3 + 4 * (((M) >> 3) & 0x1))); })

/// \brief Merges 32-bit single-precision data values stored in either of the
///    two 256-bit vectors of [8 x float], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value.
///    When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 32-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Shuffle indices 0-7 name the elements of V1 and 8-15 the same positions in
   V2, so a set mask bit adds 8 to the element's own index. */
#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_shufflevector( \
      (__v8sf)(__m256)(V1), (__v8sf)(__m256)(V2), \
      (0 + 8 * (((M) >> 0) & 0x1)), \
      (1 + 8 * (((M) >> 1) & 0x1)), \
      (2 + 8 * (((M) >> 2) & 0x1)), \
      (3 + 8 * (((M) >> 3) & 0x1)), \
      (4 + 8 * (((M) >> 4) & 0x1)), \
      (5 + 8 * (((M) >> 5) & 0x1)), \
      (6 + 8 * (((M) >> 6) & 0x1)), \
      (7 + 8 * (((M) >> 7) & 0x1))); })

/// \brief Merges 64-bit double-precision data values stored in either of the
///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
///    operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double].
/// \param __b
///    A 256-bit vector of [4 x double].
/// \param __c
///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
///    how the values are to be copied. The position of the mask bit corresponds
///    to the most significant bit of a copied value. When a mask bit is 0, the
///    corresponding 64-bit element in operand \a __a is copied to the same
///    position in the destination. When a mask bit is 1, the corresponding
///    64-bit element in operand \a __b is copied to the same position in the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
1422 static __inline __m256d __DEFAULT_FN_ATTRS 1423 _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) 1424 { 1425 return (__m256d)__builtin_ia32_blendvpd256( 1426 (__v4df)__a, (__v4df)__b, (__v4df)__c); 1427 } 1428 1429 /// \brief Merges 32-bit single-precision data values stored in either of the 1430 /// two 256-bit vectors of [8 x float], as specified by the 256-bit vector 1431 /// operand. 1432 /// 1433 /// \headerfile <x86intrin.h> 1434 /// 1435 /// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction. 1436 /// 1437 /// \param __a 1438 /// A 256-bit vector of [8 x float]. 1439 /// \param __b 1440 /// A 256-bit vector of [8 x float]. 1441 /// \param __c 1442 /// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63, 1443 /// and 31 specifying how the values are to be copied. The position of the 1444 /// mask bit corresponds to the most significant bit of a copied value. When 1445 /// a mask bit is 0, the corresponding 32-bit element in operand \a __a is 1446 /// copied to the same position in the destination. When a mask bit is 1, the 1447 /// corresponding 32-bit element in operand \a __b is copied to the same 1448 /// position in the destination. 1449 /// \returns A 256-bit vector of [8 x float] containing the copied values. 1450 static __inline __m256 __DEFAULT_FN_ATTRS 1451 _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) 1452 { 1453 return (__m256)__builtin_ia32_blendvps256( 1454 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c); 1455 } 1456 1457 /* Vector Dot Product */ 1458 /// \brief Computes two dot products in parallel, using the lower and upper 1459 /// halves of two [8 x float] vectors as input to the two computations, and 1460 /// returning the two dot products in the lower and upper halves of the 1461 /// [8 x float] result. The immediate integer operand controls which input 1462 /// elements will contribute to the dot product, and where the final results 1463 /// are returned. 
///    In general, for each dot product, the four corresponding
///    elements of the input vectors are multiplied; the first two and second
///    two products are summed, then the two sums are added to form the final
///    result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
///
/// \param V1
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param V2
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param M
///    An immediate integer argument. Bits [7:4] determine which elements of
///    the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [7] corresponding to the highest element of each [4 x
///    float] subvector. If a bit is set, the corresponding elements from the
///    two input vectors are used as an input for dot product; otherwise that
///    input is treated as zero. Bits [3:0] determine which elements of the
///    result will receive a copy of the final dot product, with bit [0]
///    corresponding to the lowest element and bit [3] corresponding to the
///    highest element of each [4 x float] subvector. If a bit is set, the dot
///    product is returned in the corresponding element; otherwise that element
///    is set to zero. The bitmask is applied in the same way to each of the
///    two parallel dot product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
/* Written as a macro so that M, documented above as an immediate argument, is
   forwarded to the builtin unchanged. V1 and V2 are first converted to the
   public __m256 type and then to the element-typed __v8sf vector that is
   passed to the builtin. */
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                 (__v8sf)(__m256)(V2), (M)); })

/* Vector shuffle */
/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
///    specified by the immediate value operand. The four selected elements in
///    each operand are copied to the destination according to the bits
///    specified in the immediate operand. The selected elements from the first
///    256-bit operand are copied to bits [63:0] and bits [191:128] of the
///    destination, and the selected elements from the second 256-bit operand
///    are copied to bits [127:64] and bits [255:192] of the destination. For
///    example, if bits [7:0] of the immediate operand contain a value of 0xFF,
///    the 256-bit destination vector would contain the following values: b[7],
///    b[7], a[7], a[7], b[3], b[3], a[3], a[3].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [63:0] and bits [191:128] in the destination,
///    according to the bits specified in the immediate operand.
/// \param b
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [127:64] and bits [255:192] in the
///    destination, according to the bits specified in the immediate operand.
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b.
///    \n
///    The destinations within the 256-bit destination are assigned values as
///    follows, according to the bit value assignments described below: \n
///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
///    the destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
///    11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
/* Shuffle indices 0-7 name elements of a and 8-15 elements of b. The base
   offsets 0/8 address the low 128-bit halves and 4/12 the high halves; the
   same four 2-bit fields of mask are reused for both halves. */
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
  (__m256)__builtin_shufflevector( \
      (__v8sf)(__m256)(a), (__v8sf)(__m256)(b), \
      ((mask) & 0x03), \
      (((mask) & 0x0c) >> 2), \
      (8 + (((mask) & 0x30) >> 4)), \
      (8 + (((mask) & 0xc0) >> 6)), \
      (4 + ((mask) & 0x03)), \
      (4 + (((mask) & 0x0c) >> 2)), \
      (12 + (((mask) & 0x30) >> 4)), \
      (12 + (((mask) & 0xc0) >> 6))); })

/// \brief Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand. The selected
///    elements from the first 256-bit operand are copied to bits [63:0] and
///    bits [191:128] in the destination, and the selected elements from the
///    second 256-bit operand are copied to bits [127:64] and bits [255:192] in
///    the destination.
For example, if bits [3:0] of the immediate operand 1565 /// contain a value of 0xF, the 256-bit destination vector would contain the 1566 /// following values: b[3], a[3], b[1], a[1]. 1567 /// 1568 /// \headerfile <x86intrin.h> 1569 /// 1570 /// \code 1571 /// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask); 1572 /// \endcode 1573 /// 1574 /// This intrinsic corresponds to the <c> VSHUFPD </c> instruction. 1575 /// 1576 /// \param a 1577 /// A 256-bit vector of [4 x double]. 1578 /// \param b 1579 /// A 256-bit vector of [4 x double]. 1580 /// \param mask 1581 /// An immediate value containing 8-bit values specifying which elements to 1582 /// copy from \a a and \a b: \n 1583 /// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the 1584 /// destination. \n 1585 /// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the 1586 /// destination. \n 1587 /// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the 1588 /// destination. \n 1589 /// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the 1590 /// destination. \n 1591 /// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the 1592 /// destination. \n 1593 /// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the 1594 /// destination. \n 1595 /// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the 1596 /// destination. \n 1597 /// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the 1598 /// destination. 1599 /// \returns A 256-bit vector of [4 x double] containing the shuffled values. 
/* Implementation note: in __builtin_shufflevector, indices 0-3 select elements
 * of (a) and indices 4-7 select elements of (b). Each of the four mask bits
 * chooses the low or the high double of one 128-bit half of one operand. */
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
                                   (__v4df)(__m256d)(b), \
                                   0 + (((mask) >> 0) & 0x1), \
                                   4 + (((mask) >> 1) & 0x1), \
                                   2 + (((mask) >> 2) & 0x1), \
                                   6 + (((mask) >> 3) & 0x1)); })

/* Compare */
/* Predicate encodings matching the values listed for the immediate operand of
 * the _mm*_cmp_* macros in this header. In the names, O/U stands for
 * ordered/unordered and a trailing S/Q for signaling/non-signaling, as spelled
 * out in the comment next to each value. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling) */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling) */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling) */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */

/// \brief Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand. Returns a [2 x double] vector consisting of
///    two doubles corresponding to the two comparison results: zero if the
///    comparison is false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                (__v2df)(__m128d)(b), (c)); })

/// \brief Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand. Returns a [4 x float] vector consisting of four floats
///    corresponding to the four comparison results: zero if the comparison is
///    false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                               (__v4sf)(__m128)(b), (c)); })

/// \brief Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand. Returns a [4 x double] vector consisting of
///    four doubles corresponding to the four comparison results: zero if the
///    comparison is false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                   (__v4df)(__m256d)(b), (c)); })

/// \brief Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand. Returns a [8 x float] vector consisting of eight floats
///    corresponding to the eight comparison results: zero if the comparison is
///    false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                  (__v8sf)(__m256)(b), (c)); })

/// \brief Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand. If the result is true, all 64 bits of the
///    destination vector are set; otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                (__v2df)(__m128d)(b), (c)); })

/// \brief Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand. If the result is true, all 32 bits of the destination
///    vector are set; otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
1972 /// 0x15 : Not-less-than (unordered, non-signaling) 1973 /// 0x16 : Not-less-than-or-equal (unordered, non-signaling) 1974 /// 0x17 : Ordered (signaling) 1975 /// 0x18 : Equal (unordered, signaling) 1976 /// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) 1977 /// 0x1a : Not-greater-than (unordered, non-signaling) 1978 /// 0x1b : False (ordered, signaling) 1979 /// 0x1c : Not-equal (ordered, signaling) 1980 /// 0x1d : Greater-than-or-equal (ordered, non-signaling) 1981 /// 0x1e : Greater-than (ordered, non-signaling) 1982 /// 0x1f : True (unordered, signaling) 1983 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 1984 #define _mm_cmp_ss(a, b, c) __extension__ ({ \ 1985 (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \ 1986 (__v4sf)(__m128)(b), (c)); }) 1987 1988 /// \brief Takes a [8 x i32] vector and returns the vector element value 1989 /// indexed by the immediate constant operand. 1990 /// 1991 /// \headerfile <x86intrin.h> 1992 /// 1993 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 1994 /// instruction. 1995 /// 1996 /// \param __a 1997 /// A 256-bit vector of [8 x i32]. 1998 /// \param __imm 1999 /// An immediate integer operand with bits [2:0] determining which vector 2000 /// element is extracted and returned. 2001 /// \returns A 32-bit integer containing the extracted 32 bits of extended 2002 /// packed data. 2003 static __inline int __DEFAULT_FN_ATTRS 2004 _mm256_extract_epi32(__m256i __a, const int __imm) 2005 { 2006 __v8si __b = (__v8si)__a; 2007 return __b[__imm & 7]; 2008 } 2009 2010 /// \brief Takes a [16 x i16] vector and returns the vector element value 2011 /// indexed by the immediate constant operand. 2012 /// 2013 /// \headerfile <x86intrin.h> 2014 /// 2015 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 2016 /// instruction. 2017 /// 2018 /// \param __a 2019 /// A 256-bit integer vector of [16 x i16]. 
2020 /// \param __imm 2021 /// An immediate integer operand with bits [3:0] determining which vector 2022 /// element is extracted and returned. 2023 /// \returns A 32-bit integer containing the extracted 16 bits of zero extended 2024 /// packed data. 2025 static __inline int __DEFAULT_FN_ATTRS 2026 _mm256_extract_epi16(__m256i __a, const int __imm) 2027 { 2028 __v16hi __b = (__v16hi)__a; 2029 return (unsigned short)__b[__imm & 15]; 2030 } 2031 2032 /// \brief Takes a [32 x i8] vector and returns the vector element value 2033 /// indexed by the immediate constant operand. 2034 /// 2035 /// \headerfile <x86intrin.h> 2036 /// 2037 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 2038 /// instruction. 2039 /// 2040 /// \param __a 2041 /// A 256-bit integer vector of [32 x i8]. 2042 /// \param __imm 2043 /// An immediate integer operand with bits [4:0] determining which vector 2044 /// element is extracted and returned. 2045 /// \returns A 32-bit integer containing the extracted 8 bits of zero extended 2046 /// packed data. 2047 static __inline int __DEFAULT_FN_ATTRS 2048 _mm256_extract_epi8(__m256i __a, const int __imm) 2049 { 2050 __v32qi __b = (__v32qi)__a; 2051 return (unsigned char)__b[__imm & 31]; 2052 } 2053 2054 #ifdef __x86_64__ 2055 /// \brief Takes a [4 x i64] vector and returns the vector element value 2056 /// indexed by the immediate constant operand. 2057 /// 2058 /// \headerfile <x86intrin.h> 2059 /// 2060 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 2061 /// instruction. 2062 /// 2063 /// \param __a 2064 /// A 256-bit integer vector of [4 x i64]. 2065 /// \param __imm 2066 /// An immediate integer operand with bits [1:0] determining which vector 2067 /// element is extracted and returned. 2068 /// \returns A 64-bit integer containing the extracted 64 bits of extended 2069 /// packed data. 
2070 static __inline long long __DEFAULT_FN_ATTRS 2071 _mm256_extract_epi64(__m256i __a, const int __imm) 2072 { 2073 __v4di __b = (__v4di)__a; 2074 return __b[__imm & 3]; 2075 } 2076 #endif 2077 2078 /// \brief Takes a [8 x i32] vector and replaces the vector element value 2079 /// indexed by the immediate constant operand by a new value. Returns the 2080 /// modified vector. 2081 /// 2082 /// \headerfile <x86intrin.h> 2083 /// 2084 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2085 /// instruction. 2086 /// 2087 /// \param __a 2088 /// A vector of [8 x i32] to be used by the insert operation. 2089 /// \param __b 2090 /// An integer value. The replacement value for the insert operation. 2091 /// \param __imm 2092 /// An immediate integer specifying the index of the vector element to be 2093 /// replaced. 2094 /// \returns A copy of vector \a __a, after replacing its element indexed by 2095 /// \a __imm with \a __b. 2096 static __inline __m256i __DEFAULT_FN_ATTRS 2097 _mm256_insert_epi32(__m256i __a, int __b, int const __imm) 2098 { 2099 __v8si __c = (__v8si)__a; 2100 __c[__imm & 7] = __b; 2101 return (__m256i)__c; 2102 } 2103 2104 2105 /// \brief Takes a [16 x i16] vector and replaces the vector element value 2106 /// indexed by the immediate constant operand with a new value. Returns the 2107 /// modified vector. 2108 /// 2109 /// \headerfile <x86intrin.h> 2110 /// 2111 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2112 /// instruction. 2113 /// 2114 /// \param __a 2115 /// A vector of [16 x i16] to be used by the insert operation. 2116 /// \param __b 2117 /// An i16 integer value. The replacement value for the insert operation. 2118 /// \param __imm 2119 /// An immediate integer specifying the index of the vector element to be 2120 /// replaced. 2121 /// \returns A copy of vector \a __a, after replacing its element indexed by 2122 /// \a __imm with \a __b. 
2123 static __inline __m256i __DEFAULT_FN_ATTRS 2124 _mm256_insert_epi16(__m256i __a, int __b, int const __imm) 2125 { 2126 __v16hi __c = (__v16hi)__a; 2127 __c[__imm & 15] = __b; 2128 return (__m256i)__c; 2129 } 2130 2131 /// \brief Takes a [32 x i8] vector and replaces the vector element value 2132 /// indexed by the immediate constant operand with a new value. Returns the 2133 /// modified vector. 2134 /// 2135 /// \headerfile <x86intrin.h> 2136 /// 2137 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2138 /// instruction. 2139 /// 2140 /// \param __a 2141 /// A vector of [32 x i8] to be used by the insert operation. 2142 /// \param __b 2143 /// An i8 integer value. The replacement value for the insert operation. 2144 /// \param __imm 2145 /// An immediate integer specifying the index of the vector element to be 2146 /// replaced. 2147 /// \returns A copy of vector \a __a, after replacing its element indexed by 2148 /// \a __imm with \a __b. 2149 static __inline __m256i __DEFAULT_FN_ATTRS 2150 _mm256_insert_epi8(__m256i __a, int __b, int const __imm) 2151 { 2152 __v32qi __c = (__v32qi)__a; 2153 __c[__imm & 31] = __b; 2154 return (__m256i)__c; 2155 } 2156 2157 #ifdef __x86_64__ 2158 /// \brief Takes a [4 x i64] vector and replaces the vector element value 2159 /// indexed by the immediate constant operand with a new value. Returns the 2160 /// modified vector. 2161 /// 2162 /// \headerfile <x86intrin.h> 2163 /// 2164 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2165 /// instruction. 2166 /// 2167 /// \param __a 2168 /// A vector of [4 x i64] to be used by the insert operation. 2169 /// \param __b 2170 /// A 64-bit integer value. The replacement value for the insert operation. 2171 /// \param __imm 2172 /// An immediate integer specifying the index of the vector element to be 2173 /// replaced. 2174 /// \returns A copy of vector \a __a, after replacing its element indexed by 2175 /// \a __imm with \a __b. 
2176 static __inline __m256i __DEFAULT_FN_ATTRS 2177 _mm256_insert_epi64(__m256i __a, long long __b, int const __imm) 2178 { 2179 __v4di __c = (__v4di)__a; 2180 __c[__imm & 3] = __b; 2181 return (__m256i)__c; 2182 } 2183 #endif 2184 2185 /* Conversion */ 2186 /// \brief Converts a vector of [4 x i32] into a vector of [4 x double]. 2187 /// 2188 /// \headerfile <x86intrin.h> 2189 /// 2190 /// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction. 2191 /// 2192 /// \param __a 2193 /// A 128-bit integer vector of [4 x i32]. 2194 /// \returns A 256-bit vector of [4 x double] containing the converted values. 2195 static __inline __m256d __DEFAULT_FN_ATTRS 2196 _mm256_cvtepi32_pd(__m128i __a) 2197 { 2198 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df); 2199 } 2200 2201 /// \brief Converts a vector of [8 x i32] into a vector of [8 x float]. 2202 /// 2203 /// \headerfile <x86intrin.h> 2204 /// 2205 /// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction. 2206 /// 2207 /// \param __a 2208 /// A 256-bit integer vector. 2209 /// \returns A 256-bit vector of [8 x float] containing the converted values. 2210 static __inline __m256 __DEFAULT_FN_ATTRS 2211 _mm256_cvtepi32_ps(__m256i __a) 2212 { 2213 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a); 2214 } 2215 2216 /// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of 2217 /// [4 x float]. 2218 /// 2219 /// \headerfile <x86intrin.h> 2220 /// 2221 /// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction. 2222 /// 2223 /// \param __a 2224 /// A 256-bit vector of [4 x double]. 2225 /// \returns A 128-bit vector of [4 x float] containing the converted values. 2226 static __inline __m128 __DEFAULT_FN_ATTRS 2227 _mm256_cvtpd_ps(__m256d __a) 2228 { 2229 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a); 2230 } 2231 2232 /// \brief Converts a vector of [8 x float] into a vector of [8 x i32]. 
2233 /// 2234 /// \headerfile <x86intrin.h> 2235 /// 2236 /// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction. 2237 /// 2238 /// \param __a 2239 /// A 256-bit vector of [8 x float]. 2240 /// \returns A 256-bit integer vector containing the converted values. 2241 static __inline __m256i __DEFAULT_FN_ATTRS 2242 _mm256_cvtps_epi32(__m256 __a) 2243 { 2244 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a); 2245 } 2246 2247 /// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 2248 /// x double]. 2249 /// 2250 /// \headerfile <x86intrin.h> 2251 /// 2252 /// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction. 2253 /// 2254 /// \param __a 2255 /// A 128-bit vector of [4 x float]. 2256 /// \returns A 256-bit vector of [4 x double] containing the converted values. 2257 static __inline __m256d __DEFAULT_FN_ATTRS 2258 _mm256_cvtps_pd(__m128 __a) 2259 { 2260 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df); 2261 } 2262 2263 /// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 2264 /// x i32], truncating the result by rounding towards zero when it is 2265 /// inexact. 2266 /// 2267 /// \headerfile <x86intrin.h> 2268 /// 2269 /// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction. 2270 /// 2271 /// \param __a 2272 /// A 256-bit vector of [4 x double]. 2273 /// \returns A 128-bit integer vector containing the converted values. 2274 static __inline __m128i __DEFAULT_FN_ATTRS 2275 _mm256_cvttpd_epi32(__m256d __a) 2276 { 2277 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a); 2278 } 2279 2280 /// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 2281 /// x i32]. When a conversion is inexact, the value returned is rounded 2282 /// according to the rounding control bits in the MXCSR register. 2283 /// 2284 /// \headerfile <x86intrin.h> 2285 /// 2286 /// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction. 
2287 /// 2288 /// \param __a 2289 /// A 256-bit vector of [4 x double]. 2290 /// \returns A 128-bit integer vector containing the converted values. 2291 static __inline __m128i __DEFAULT_FN_ATTRS 2292 _mm256_cvtpd_epi32(__m256d __a) 2293 { 2294 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a); 2295 } 2296 2297 /// \brief Converts a vector of [8 x float] into a vector of [8 x i32], 2298 /// truncating the result by rounding towards zero when it is inexact. 2299 /// 2300 /// \headerfile <x86intrin.h> 2301 /// 2302 /// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction. 2303 /// 2304 /// \param __a 2305 /// A 256-bit vector of [8 x float]. 2306 /// \returns A 256-bit integer vector containing the converted values. 2307 static __inline __m256i __DEFAULT_FN_ATTRS 2308 _mm256_cvttps_epi32(__m256 __a) 2309 { 2310 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a); 2311 } 2312 2313 /// \brief Returns the first element of the input vector of [4 x double]. 2314 /// 2315 /// \headerfile <avxintrin.h> 2316 /// 2317 /// This intrinsic is a utility function and does not correspond to a specific 2318 /// instruction. 2319 /// 2320 /// \param __a 2321 /// A 256-bit vector of [4 x double]. 2322 /// \returns A 64 bit double containing the first element of the input vector. 2323 static __inline double __DEFAULT_FN_ATTRS 2324 _mm256_cvtsd_f64(__m256d __a) 2325 { 2326 return __a[0]; 2327 } 2328 2329 /// \brief Returns the first element of the input vector of [8 x i32]. 2330 /// 2331 /// \headerfile <avxintrin.h> 2332 /// 2333 /// This intrinsic is a utility function and does not correspond to a specific 2334 /// instruction. 2335 /// 2336 /// \param __a 2337 /// A 256-bit vector of [8 x i32]. 2338 /// \returns A 32 bit integer containing the first element of the input vector. 
2339 static __inline int __DEFAULT_FN_ATTRS 2340 _mm256_cvtsi256_si32(__m256i __a) 2341 { 2342 __v8si __b = (__v8si)__a; 2343 return __b[0]; 2344 } 2345 2346 /// \brief Returns the first element of the input vector of [8 x float]. 2347 /// 2348 /// \headerfile <avxintrin.h> 2349 /// 2350 /// This intrinsic is a utility function and does not correspond to a specific 2351 /// instruction. 2352 /// 2353 /// \param __a 2354 /// A 256-bit vector of [8 x float]. 2355 /// \returns A 32 bit float containing the first element of the input vector. 2356 static __inline float __DEFAULT_FN_ATTRS 2357 _mm256_cvtss_f32(__m256 __a) 2358 { 2359 return __a[0]; 2360 } 2361 2362 /* Vector replicate */ 2363 /// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit 2364 /// vector of [8 x float] to float values in a 256-bit vector of 2365 /// [8 x float]. 2366 /// 2367 /// \headerfile <x86intrin.h> 2368 /// 2369 /// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction. 2370 /// 2371 /// \param __a 2372 /// A 256-bit vector of [8 x float]. \n 2373 /// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of 2374 /// the return value. \n 2375 /// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of 2376 /// the return value. \n 2377 /// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the 2378 /// return value. \n 2379 /// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the 2380 /// return value. 2381 /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated 2382 /// values. 2383 static __inline __m256 __DEFAULT_FN_ATTRS 2384 _mm256_movehdup_ps(__m256 __a) 2385 { 2386 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7); 2387 } 2388 2389 /// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit 2390 /// vector of [8 x float] to float values in a 256-bit vector of [8 x float]. 
2391 /// 2392 /// \headerfile <x86intrin.h> 2393 /// 2394 /// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction. 2395 /// 2396 /// \param __a 2397 /// A 256-bit vector of [8 x float]. \n 2398 /// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of 2399 /// the return value. \n 2400 /// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of 2401 /// the return value. \n 2402 /// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the 2403 /// return value. \n 2404 /// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the 2405 /// return value. 2406 /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated 2407 /// values. 2408 static __inline __m256 __DEFAULT_FN_ATTRS 2409 _mm256_moveldup_ps(__m256 __a) 2410 { 2411 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6); 2412 } 2413 2414 /// \brief Moves and duplicates double-precision floating point values from a 2415 /// 256-bit vector of [4 x double] to double-precision values in a 256-bit 2416 /// vector of [4 x double]. 2417 /// 2418 /// \headerfile <x86intrin.h> 2419 /// 2420 /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. 2421 /// 2422 /// \param __a 2423 /// A 256-bit vector of [4 x double]. \n 2424 /// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the 2425 /// return value. \n 2426 /// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of 2427 /// the return value. 2428 /// \returns A 256-bit vector of [4 x double] containing the moved and 2429 /// duplicated values. 
2430 static __inline __m256d __DEFAULT_FN_ATTRS 2431 _mm256_movedup_pd(__m256d __a) 2432 { 2433 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2); 2434 } 2435 2436 /* Unpack and Interleave */ 2437 /// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of 2438 /// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. 2439 /// 2440 /// \headerfile <x86intrin.h> 2441 /// 2442 /// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction. 2443 /// 2444 /// \param __a 2445 /// A 256-bit floating-point vector of [4 x double]. \n 2446 /// Bits [127:64] are written to bits [63:0] of the return value. \n 2447 /// Bits [255:192] are written to bits [191:128] of the return value. \n 2448 /// \param __b 2449 /// A 256-bit floating-point vector of [4 x double]. \n 2450 /// Bits [127:64] are written to bits [127:64] of the return value. \n 2451 /// Bits [255:192] are written to bits [255:192] of the return value. \n 2452 /// \returns A 256-bit vector of [4 x double] containing the interleaved values. 2453 static __inline __m256d __DEFAULT_FN_ATTRS 2454 _mm256_unpackhi_pd(__m256d __a, __m256d __b) 2455 { 2456 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2); 2457 } 2458 2459 /// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of 2460 /// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. 2461 /// 2462 /// \headerfile <x86intrin.h> 2463 /// 2464 /// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction. 2465 /// 2466 /// \param __a 2467 /// A 256-bit floating-point vector of [4 x double]. \n 2468 /// Bits [63:0] are written to bits [63:0] of the return value. \n 2469 /// Bits [191:128] are written to bits [191:128] of the return value. 2470 /// \param __b 2471 /// A 256-bit floating-point vector of [4 x double]. \n 2472 /// Bits [63:0] are written to bits [127:64] of the return value. 
\n 2473 /// Bits [191:128] are written to bits [255:192] of the return value. \n 2474 /// \returns A 256-bit vector of [4 x double] containing the interleaved values. 2475 static __inline __m256d __DEFAULT_FN_ATTRS 2476 _mm256_unpacklo_pd(__m256d __a, __m256d __b) 2477 { 2478 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2); 2479 } 2480 2481 /// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the 2482 /// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit 2483 /// vector of [8 x float]. 2484 /// 2485 /// \headerfile <x86intrin.h> 2486 /// 2487 /// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction. 2488 /// 2489 /// \param __a 2490 /// A 256-bit vector of [8 x float]. \n 2491 /// Bits [95:64] are written to bits [31:0] of the return value. \n 2492 /// Bits [127:96] are written to bits [95:64] of the return value. \n 2493 /// Bits [223:192] are written to bits [159:128] of the return value. \n 2494 /// Bits [255:224] are written to bits [223:192] of the return value. 2495 /// \param __b 2496 /// A 256-bit vector of [8 x float]. \n 2497 /// Bits [95:64] are written to bits [63:32] of the return value. \n 2498 /// Bits [127:96] are written to bits [127:96] of the return value. \n 2499 /// Bits [223:192] are written to bits [191:160] of the return value. \n 2500 /// Bits [255:224] are written to bits [255:224] of the return value. 2501 /// \returns A 256-bit vector of [8 x float] containing the interleaved values. 2502 static __inline __m256 __DEFAULT_FN_ATTRS 2503 _mm256_unpackhi_ps(__m256 __a, __m256 __b) 2504 { 2505 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); 2506 } 2507 2508 /// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the 2509 /// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit 2510 /// vector of [8 x float]. 
2511 /// 2512 /// \headerfile <x86intrin.h> 2513 /// 2514 /// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction. 2515 /// 2516 /// \param __a 2517 /// A 256-bit vector of [8 x float]. \n 2518 /// Bits [31:0] are written to bits [31:0] of the return value. \n 2519 /// Bits [63:32] are written to bits [95:64] of the return value. \n 2520 /// Bits [159:128] are written to bits [159:128] of the return value. \n 2521 /// Bits [191:160] are written to bits [223:192] of the return value. 2522 /// \param __b 2523 /// A 256-bit vector of [8 x float]. \n 2524 /// Bits [31:0] are written to bits [63:32] of the return value. \n 2525 /// Bits [63:32] are written to bits [127:96] of the return value. \n 2526 /// Bits [159:128] are written to bits [191:160] of the return value. \n 2527 /// Bits [191:160] are written to bits [255:224] of the return value. 2528 /// \returns A 256-bit vector of [8 x float] containing the interleaved values. 2529 static __inline __m256 __DEFAULT_FN_ATTRS 2530 _mm256_unpacklo_ps(__m256 __a, __m256 __b) 2531 { 2532 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); 2533 } 2534 2535 /* Bit Test */ 2536 /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an 2537 /// element-by-element comparison of the double-precision element in the 2538 /// first source vector and the corresponding element in the second source 2539 /// vector. The EFLAGS register is updated as follows: \n 2540 /// If there is at least one pair of double-precision elements where the 2541 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2542 /// ZF flag is set to 1. \n 2543 /// If there is at least one pair of double-precision elements where the 2544 /// sign-bit of the first element is 0 and the sign-bit of the second element 2545 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2546 /// This intrinsic returns the value of the ZF flag. 
2547 /// 2548 /// \headerfile <x86intrin.h> 2549 /// 2550 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2551 /// 2552 /// \param __a 2553 /// A 128-bit vector of [2 x double]. 2554 /// \param __b 2555 /// A 128-bit vector of [2 x double]. 2556 /// \returns the ZF flag in the EFLAGS register. 2557 static __inline int __DEFAULT_FN_ATTRS 2558 _mm_testz_pd(__m128d __a, __m128d __b) 2559 { 2560 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b); 2561 } 2562 2563 /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an 2564 /// element-by-element comparison of the double-precision element in the 2565 /// first source vector and the corresponding element in the second source 2566 /// vector. The EFLAGS register is updated as follows: \n 2567 /// If there is at least one pair of double-precision elements where the 2568 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2569 /// ZF flag is set to 1. \n 2570 /// If there is at least one pair of double-precision elements where the 2571 /// sign-bit of the first element is 0 and the sign-bit of the second element 2572 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2573 /// This intrinsic returns the value of the CF flag. 2574 /// 2575 /// \headerfile <x86intrin.h> 2576 /// 2577 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2578 /// 2579 /// \param __a 2580 /// A 128-bit vector of [2 x double]. 2581 /// \param __b 2582 /// A 128-bit vector of [2 x double]. 2583 /// \returns the CF flag in the EFLAGS register. 
2584 static __inline int __DEFAULT_FN_ATTRS 2585 _mm_testc_pd(__m128d __a, __m128d __b) 2586 { 2587 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b); 2588 } 2589 2590 /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an 2591 /// element-by-element comparison of the double-precision element in the 2592 /// first source vector and the corresponding element in the second source 2593 /// vector. The EFLAGS register is updated as follows: \n 2594 /// If there is at least one pair of double-precision elements where the 2595 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2596 /// ZF flag is set to 1. \n 2597 /// If there is at least one pair of double-precision elements where the 2598 /// sign-bit of the first element is 0 and the sign-bit of the second element 2599 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2600 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2601 /// otherwise it returns 0. 2602 /// 2603 /// \headerfile <x86intrin.h> 2604 /// 2605 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2606 /// 2607 /// \param __a 2608 /// A 128-bit vector of [2 x double]. 2609 /// \param __b 2610 /// A 128-bit vector of [2 x double]. 2611 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2612 static __inline int __DEFAULT_FN_ATTRS 2613 _mm_testnzc_pd(__m128d __a, __m128d __b) 2614 { 2615 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b); 2616 } 2617 2618 /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an 2619 /// element-by-element comparison of the single-precision element in the 2620 /// first source vector and the corresponding element in the second source 2621 /// vector. The EFLAGS register is updated as follows: \n 2622 /// If there is at least one pair of single-precision elements where the 2623 /// sign-bits of both elements are 1, the ZF flag is set to 0. 
Otherwise the 2624 /// ZF flag is set to 1. \n 2625 /// If there is at least one pair of single-precision elements where the 2626 /// sign-bit of the first element is 0 and the sign-bit of the second element 2627 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2628 /// This intrinsic returns the value of the ZF flag. 2629 /// 2630 /// \headerfile <x86intrin.h> 2631 /// 2632 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2633 /// 2634 /// \param __a 2635 /// A 128-bit vector of [4 x float]. 2636 /// \param __b 2637 /// A 128-bit vector of [4 x float]. 2638 /// \returns the ZF flag. 2639 static __inline int __DEFAULT_FN_ATTRS 2640 _mm_testz_ps(__m128 __a, __m128 __b) 2641 { 2642 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b); 2643 } 2644 2645 /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an 2646 /// element-by-element comparison of the single-precision element in the 2647 /// first source vector and the corresponding element in the second source 2648 /// vector. The EFLAGS register is updated as follows: \n 2649 /// If there is at least one pair of single-precision elements where the 2650 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2651 /// ZF flag is set to 1. \n 2652 /// If there is at least one pair of single-precision elements where the 2653 /// sign-bit of the first element is 0 and the sign-bit of the second element 2654 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2655 /// This intrinsic returns the value of the CF flag. 2656 /// 2657 /// \headerfile <x86intrin.h> 2658 /// 2659 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2660 /// 2661 /// \param __a 2662 /// A 128-bit vector of [4 x float]. 2663 /// \param __b 2664 /// A 128-bit vector of [4 x float]. 2665 /// \returns the CF flag. 
2666 static __inline int __DEFAULT_FN_ATTRS 2667 _mm_testc_ps(__m128 __a, __m128 __b) 2668 { 2669 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b); 2670 } 2671 2672 /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an 2673 /// element-by-element comparison of the single-precision element in the 2674 /// first source vector and the corresponding element in the second source 2675 /// vector. The EFLAGS register is updated as follows: \n 2676 /// If there is at least one pair of single-precision elements where the 2677 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2678 /// ZF flag is set to 1. \n 2679 /// If there is at least one pair of single-precision elements where the 2680 /// sign-bit of the first element is 0 and the sign-bit of the second element 2681 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2682 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2683 /// otherwise it returns 0. 2684 /// 2685 /// \headerfile <x86intrin.h> 2686 /// 2687 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2688 /// 2689 /// \param __a 2690 /// A 128-bit vector of [4 x float]. 2691 /// \param __b 2692 /// A 128-bit vector of [4 x float]. 2693 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2694 static __inline int __DEFAULT_FN_ATTRS 2695 _mm_testnzc_ps(__m128 __a, __m128 __b) 2696 { 2697 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b); 2698 } 2699 2700 /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an 2701 /// element-by-element comparison of the double-precision elements in the 2702 /// first source vector and the corresponding elements in the second source 2703 /// vector. The EFLAGS register is updated as follows: \n 2704 /// If there is at least one pair of double-precision elements where the 2705 /// sign-bits of both elements are 1, the ZF flag is set to 0. 
Otherwise the 2706 /// ZF flag is set to 1. \n 2707 /// If there is at least one pair of double-precision elements where the 2708 /// sign-bit of the first element is 0 and the sign-bit of the second element 2709 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2710 /// This intrinsic returns the value of the ZF flag. 2711 /// 2712 /// \headerfile <x86intrin.h> 2713 /// 2714 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2715 /// 2716 /// \param __a 2717 /// A 256-bit vector of [4 x double]. 2718 /// \param __b 2719 /// A 256-bit vector of [4 x double]. 2720 /// \returns the ZF flag. 2721 static __inline int __DEFAULT_FN_ATTRS 2722 _mm256_testz_pd(__m256d __a, __m256d __b) 2723 { 2724 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b); 2725 } 2726 2727 /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an 2728 /// element-by-element comparison of the double-precision elements in the 2729 /// first source vector and the corresponding elements in the second source 2730 /// vector. The EFLAGS register is updated as follows: \n 2731 /// If there is at least one pair of double-precision elements where the 2732 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2733 /// ZF flag is set to 1. \n 2734 /// If there is at least one pair of double-precision elements where the 2735 /// sign-bit of the first element is 0 and the sign-bit of the second element 2736 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2737 /// This intrinsic returns the value of the CF flag. 2738 /// 2739 /// \headerfile <x86intrin.h> 2740 /// 2741 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2742 /// 2743 /// \param __a 2744 /// A 256-bit vector of [4 x double]. 2745 /// \param __b 2746 /// A 256-bit vector of [4 x double]. 2747 /// \returns the CF flag. 
2748 static __inline int __DEFAULT_FN_ATTRS 2749 _mm256_testc_pd(__m256d __a, __m256d __b) 2750 { 2751 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b); 2752 } 2753 2754 /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an 2755 /// element-by-element comparison of the double-precision elements in the 2756 /// first source vector and the corresponding elements in the second source 2757 /// vector. The EFLAGS register is updated as follows: \n 2758 /// If there is at least one pair of double-precision elements where the 2759 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2760 /// ZF flag is set to 1. \n 2761 /// If there is at least one pair of double-precision elements where the 2762 /// sign-bit of the first element is 0 and the sign-bit of the second element 2763 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2764 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2765 /// otherwise it returns 0. 2766 /// 2767 /// \headerfile <x86intrin.h> 2768 /// 2769 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2770 /// 2771 /// \param __a 2772 /// A 256-bit vector of [4 x double]. 2773 /// \param __b 2774 /// A 256-bit vector of [4 x double]. 2775 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2776 static __inline int __DEFAULT_FN_ATTRS 2777 _mm256_testnzc_pd(__m256d __a, __m256d __b) 2778 { 2779 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b); 2780 } 2781 2782 /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an 2783 /// element-by-element comparison of the single-precision element in the 2784 /// first source vector and the corresponding element in the second source 2785 /// vector. The EFLAGS register is updated as follows: \n 2786 /// If there is at least one pair of single-precision elements where the 2787 /// sign-bits of both elements are 1, the ZF flag is set to 0. 
Otherwise the 2788 /// ZF flag is set to 1. \n 2789 /// If there is at least one pair of single-precision elements where the 2790 /// sign-bit of the first element is 0 and the sign-bit of the second element 2791 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2792 /// This intrinsic returns the value of the ZF flag. 2793 /// 2794 /// \headerfile <x86intrin.h> 2795 /// 2796 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2797 /// 2798 /// \param __a 2799 /// A 256-bit vector of [8 x float]. 2800 /// \param __b 2801 /// A 256-bit vector of [8 x float]. 2802 /// \returns the ZF flag. 2803 static __inline int __DEFAULT_FN_ATTRS 2804 _mm256_testz_ps(__m256 __a, __m256 __b) 2805 { 2806 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b); 2807 } 2808 2809 /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an 2810 /// element-by-element comparison of the single-precision element in the 2811 /// first source vector and the corresponding element in the second source 2812 /// vector. The EFLAGS register is updated as follows: \n 2813 /// If there is at least one pair of single-precision elements where the 2814 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2815 /// ZF flag is set to 1. \n 2816 /// If there is at least one pair of single-precision elements where the 2817 /// sign-bit of the first element is 0 and the sign-bit of the second element 2818 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2819 /// This intrinsic returns the value of the CF flag. 2820 /// 2821 /// \headerfile <x86intrin.h> 2822 /// 2823 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2824 /// 2825 /// \param __a 2826 /// A 256-bit vector of [8 x float]. 2827 /// \param __b 2828 /// A 256-bit vector of [8 x float]. 2829 /// \returns the CF flag. 
2830 static __inline int __DEFAULT_FN_ATTRS 2831 _mm256_testc_ps(__m256 __a, __m256 __b) 2832 { 2833 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b); 2834 } 2835 2836 /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an 2837 /// element-by-element comparison of the single-precision elements in the 2838 /// first source vector and the corresponding elements in the second source 2839 /// vector. The EFLAGS register is updated as follows: \n 2840 /// If there is at least one pair of single-precision elements where the 2841 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2842 /// ZF flag is set to 1. \n 2843 /// If there is at least one pair of single-precision elements where the 2844 /// sign-bit of the first element is 0 and the sign-bit of the second element 2845 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2846 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2847 /// otherwise it returns 0. 2848 /// 2849 /// \headerfile <x86intrin.h> 2850 /// 2851 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2852 /// 2853 /// \param __a 2854 /// A 256-bit vector of [8 x float]. 2855 /// \param __b 2856 /// A 256-bit vector of [8 x float]. 2857 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2858 static __inline int __DEFAULT_FN_ATTRS 2859 _mm256_testnzc_ps(__m256 __a, __m256 __b) 2860 { 2861 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b); 2862 } 2863 2864 /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison 2865 /// of the two source vectors and update the EFLAGS register as follows: \n 2866 /// If there is at least one pair of bits where both bits are 1, the ZF flag 2867 /// is set to 0. Otherwise the ZF flag is set to 1. 
\n 2868 /// If there is at least one pair of bits where the bit from the first source 2869 /// vector is 0 and the bit from the second source vector is 1, the CF flag 2870 /// is set to 0. Otherwise the CF flag is set to 1. \n 2871 /// This intrinsic returns the value of the ZF flag. 2872 /// 2873 /// \headerfile <x86intrin.h> 2874 /// 2875 /// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2876 /// 2877 /// \param __a 2878 /// A 256-bit integer vector. 2879 /// \param __b 2880 /// A 256-bit integer vector. 2881 /// \returns the ZF flag. 2882 static __inline int __DEFAULT_FN_ATTRS 2883 _mm256_testz_si256(__m256i __a, __m256i __b) 2884 { 2885 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b); 2886 } 2887 2888 /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison 2889 /// of the two source vectors and update the EFLAGS register as follows: \n 2890 /// If there is at least one pair of bits where both bits are 1, the ZF flag 2891 /// is set to 0. Otherwise the ZF flag is set to 1. \n 2892 /// If there is at least one pair of bits where the bit from the first source 2893 /// vector is 0 and the bit from the second source vector is 1, the CF flag 2894 /// is set to 0. Otherwise the CF flag is set to 1. \n 2895 /// This intrinsic returns the value of the CF flag. 2896 /// 2897 /// \headerfile <x86intrin.h> 2898 /// 2899 /// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2900 /// 2901 /// \param __a 2902 /// A 256-bit integer vector. 2903 /// \param __b 2904 /// A 256-bit integer vector. 2905 /// \returns the CF flag. 
2906 static __inline int __DEFAULT_FN_ATTRS 2907 _mm256_testc_si256(__m256i __a, __m256i __b) 2908 { 2909 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b); 2910 } 2911 2912 /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison 2913 /// of the two source vectors and update the EFLAGS register as follows: \n 2914 /// If there is at least one pair of bits where both bits are 1, the ZF flag 2915 /// is set to 0. Otherwise the ZF flag is set to 1. \n 2916 /// If there is at least one pair of bits where the bit from the first source 2917 /// vector is 0 and the bit from the second source vector is 1, the CF flag 2918 /// is set to 0. Otherwise the CF flag is set to 1. \n 2919 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2920 /// otherwise it returns 0. 2921 /// 2922 /// \headerfile <x86intrin.h> 2923 /// 2924 /// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2925 /// 2926 /// \param __a 2927 /// A 256-bit integer vector. 2928 /// \param __b 2929 /// A 256-bit integer vector. 2930 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2931 static __inline int __DEFAULT_FN_ATTRS 2932 _mm256_testnzc_si256(__m256i __a, __m256i __b) 2933 { 2934 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b); 2935 } 2936 2937 /* Vector extract sign mask */ 2938 /// \brief Extracts the sign bits of double-precision floating point elements 2939 /// in a 256-bit vector of [4 x double] and writes them to the lower order 2940 /// bits of the return value. 2941 /// 2942 /// \headerfile <x86intrin.h> 2943 /// 2944 /// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction. 2945 /// 2946 /// \param __a 2947 /// A 256-bit vector of [4 x double] containing the double-precision 2948 /// floating point values with sign bits to be extracted. 2949 /// \returns The sign bits from the operand, written to bits [3:0]. 
2950 static __inline int __DEFAULT_FN_ATTRS 2951 _mm256_movemask_pd(__m256d __a) 2952 { 2953 return __builtin_ia32_movmskpd256((__v4df)__a); 2954 } 2955 2956 /// \brief Extracts the sign bits of double-precision floating point elements 2957 /// in a 256-bit vector of [8 x float] and writes them to the lower order 2958 /// bits of the return value. 2959 /// 2960 /// \headerfile <x86intrin.h> 2961 /// 2962 /// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction. 2963 /// 2964 /// \param __a 2965 /// A 256-bit vector of [8 x float] containing the double-precision floating 2966 /// point values with sign bits to be extracted. 2967 /// \returns The sign bits from the operand, written to bits [7:0]. 2968 static __inline int __DEFAULT_FN_ATTRS 2969 _mm256_movemask_ps(__m256 __a) 2970 { 2971 return __builtin_ia32_movmskps256((__v8sf)__a); 2972 } 2973 2974 /* Vector __zero */ 2975 /// \brief Zeroes the contents of all XMM or YMM registers. 2976 /// 2977 /// \headerfile <x86intrin.h> 2978 /// 2979 /// This intrinsic corresponds to the <c> VZEROALL </c> instruction. 2980 static __inline void __DEFAULT_FN_ATTRS 2981 _mm256_zeroall(void) 2982 { 2983 __builtin_ia32_vzeroall(); 2984 } 2985 2986 /// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers. 2987 /// 2988 /// \headerfile <x86intrin.h> 2989 /// 2990 /// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction. 2991 static __inline void __DEFAULT_FN_ATTRS 2992 _mm256_zeroupper(void) 2993 { 2994 __builtin_ia32_vzeroupper(); 2995 } 2996 2997 /* Vector load with broadcast */ 2998 /// \brief Loads a scalar single-precision floating point value from the 2999 /// specified address pointed to by \a __a and broadcasts it to the elements 3000 /// of a [4 x float] vector. 3001 /// 3002 /// \headerfile <x86intrin.h> 3003 /// 3004 /// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. 
3005 /// 3006 /// \param __a 3007 /// The single-precision floating point value to be broadcast. 3008 /// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set 3009 /// equal to the broadcast value. 3010 static __inline __m128 __DEFAULT_FN_ATTRS 3011 _mm_broadcast_ss(float const *__a) 3012 { 3013 float __f = *__a; 3014 return (__m128)(__v4sf){ __f, __f, __f, __f }; 3015 } 3016 3017 /// \brief Loads a scalar double-precision floating point value from the 3018 /// specified address pointed to by \a __a and broadcasts it to the elements 3019 /// of a [4 x double] vector. 3020 /// 3021 /// \headerfile <x86intrin.h> 3022 /// 3023 /// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction. 3024 /// 3025 /// \param __a 3026 /// The double-precision floating point value to be broadcast. 3027 /// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set 3028 /// equal to the broadcast value. 3029 static __inline __m256d __DEFAULT_FN_ATTRS 3030 _mm256_broadcast_sd(double const *__a) 3031 { 3032 double __d = *__a; 3033 return (__m256d)(__v4df){ __d, __d, __d, __d }; 3034 } 3035 3036 /// \brief Loads a scalar single-precision floating point value from the 3037 /// specified address pointed to by \a __a and broadcasts it to the elements 3038 /// of a [8 x float] vector. 3039 /// 3040 /// \headerfile <x86intrin.h> 3041 /// 3042 /// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. 3043 /// 3044 /// \param __a 3045 /// The single-precision floating point value to be broadcast. 3046 /// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set 3047 /// equal to the broadcast value. 
3048 static __inline __m256 __DEFAULT_FN_ATTRS 3049 _mm256_broadcast_ss(float const *__a) 3050 { 3051 float __f = *__a; 3052 return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f }; 3053 } 3054 3055 /// \brief Loads the data from a 128-bit vector of [2 x double] from the 3056 /// specified address pointed to by \a __a and broadcasts it to 128-bit 3057 /// elements in a 256-bit vector of [4 x double]. 3058 /// 3059 /// \headerfile <x86intrin.h> 3060 /// 3061 /// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. 3062 /// 3063 /// \param __a 3064 /// The 128-bit vector of [2 x double] to be broadcast. 3065 /// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set 3066 /// equal to the broadcast value. 3067 static __inline __m256d __DEFAULT_FN_ATTRS 3068 _mm256_broadcast_pd(__m128d const *__a) 3069 { 3070 return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a); 3071 } 3072 3073 /// \brief Loads the data from a 128-bit vector of [4 x float] from the 3074 /// specified address pointed to by \a __a and broadcasts it to 128-bit 3075 /// elements in a 256-bit vector of [8 x float]. 3076 /// 3077 /// \headerfile <x86intrin.h> 3078 /// 3079 /// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. 3080 /// 3081 /// \param __a 3082 /// The 128-bit vector of [4 x float] to be broadcast. 3083 /// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set 3084 /// equal to the broadcast value. 3085 static __inline __m256 __DEFAULT_FN_ATTRS 3086 _mm256_broadcast_ps(__m128 const *__a) 3087 { 3088 return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a); 3089 } 3090 3091 /* SIMD load ops */ 3092 /// \brief Loads 4 double-precision floating point values from a 32-byte aligned 3093 /// memory location pointed to by \a __p into a vector of [4 x double]. 
3094 /// 3095 /// \headerfile <x86intrin.h> 3096 /// 3097 /// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. 3098 /// 3099 /// \param __p 3100 /// A 32-byte aligned pointer to a memory location containing 3101 /// double-precision floating point values. 3102 /// \returns A 256-bit vector of [4 x double] containing the moved values. 3103 static __inline __m256d __DEFAULT_FN_ATTRS 3104 _mm256_load_pd(double const *__p) 3105 { 3106 return *(__m256d *)__p; 3107 } 3108 3109 /// \brief Loads 8 single-precision floating point values from a 32-byte aligned 3110 /// memory location pointed to by \a __p into a vector of [8 x float]. 3111 /// 3112 /// \headerfile <x86intrin.h> 3113 /// 3114 /// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. 3115 /// 3116 /// \param __p 3117 /// A 32-byte aligned pointer to a memory location containing float values. 3118 /// \returns A 256-bit vector of [8 x float] containing the moved values. 3119 static __inline __m256 __DEFAULT_FN_ATTRS 3120 _mm256_load_ps(float const *__p) 3121 { 3122 return *(__m256 *)__p; 3123 } 3124 3125 /// \brief Loads 4 double-precision floating point values from an unaligned 3126 /// memory location pointed to by \a __p into a vector of [4 x double]. 3127 /// 3128 /// \headerfile <x86intrin.h> 3129 /// 3130 /// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. 3131 /// 3132 /// \param __p 3133 /// A pointer to a memory location containing double-precision floating 3134 /// point values. 3135 /// \returns A 256-bit vector of [4 x double] containing the moved values. 3136 static __inline __m256d __DEFAULT_FN_ATTRS 3137 _mm256_loadu_pd(double const *__p) 3138 { 3139 struct __loadu_pd { 3140 __m256d __v; 3141 } __attribute__((__packed__, __may_alias__)); 3142 return ((struct __loadu_pd*)__p)->__v; 3143 } 3144 3145 /// \brief Loads 8 single-precision floating point values from an unaligned 3146 /// memory location pointed to by \a __p into a vector of [8 x float]. 
3147 /// 3148 /// \headerfile <x86intrin.h> 3149 /// 3150 /// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. 3151 /// 3152 /// \param __p 3153 /// A pointer to a memory location containing single-precision floating 3154 /// point values. 3155 /// \returns A 256-bit vector of [8 x float] containing the moved values. 3156 static __inline __m256 __DEFAULT_FN_ATTRS 3157 _mm256_loadu_ps(float const *__p) 3158 { 3159 struct __loadu_ps { 3160 __m256 __v; 3161 } __attribute__((__packed__, __may_alias__)); 3162 return ((struct __loadu_ps*)__p)->__v; 3163 } 3164 3165 /// \brief Loads 256 bits of integer data from a 32-byte aligned memory 3166 /// location pointed to by \a __p into elements of a 256-bit integer vector. 3167 /// 3168 /// \headerfile <x86intrin.h> 3169 /// 3170 /// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. 3171 /// 3172 /// \param __p 3173 /// A 32-byte aligned pointer to a 256-bit integer vector containing integer 3174 /// values. 3175 /// \returns A 256-bit integer vector containing the moved values. 3176 static __inline __m256i __DEFAULT_FN_ATTRS 3177 _mm256_load_si256(__m256i const *__p) 3178 { 3179 return *__p; 3180 } 3181 3182 /// \brief Loads 256 bits of integer data from an unaligned memory location 3183 /// pointed to by \a __p into a 256-bit integer vector. 3184 /// 3185 /// \headerfile <x86intrin.h> 3186 /// 3187 /// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. 3188 /// 3189 /// \param __p 3190 /// A pointer to a 256-bit integer vector containing integer values. 3191 /// \returns A 256-bit integer vector containing the moved values. 
3192 static __inline __m256i __DEFAULT_FN_ATTRS 3193 _mm256_loadu_si256(__m256i const *__p) 3194 { 3195 struct __loadu_si256 { 3196 __m256i __v; 3197 } __attribute__((__packed__, __may_alias__)); 3198 return ((struct __loadu_si256*)__p)->__v; 3199 } 3200 3201 /// \brief Loads 256 bits of integer data from an unaligned memory location 3202 /// pointed to by \a __p into a 256-bit integer vector. This intrinsic may 3203 /// perform better than \c _mm256_loadu_si256 when the data crosses a cache 3204 /// line boundary. 3205 /// 3206 /// \headerfile <x86intrin.h> 3207 /// 3208 /// This intrinsic corresponds to the <c> VLDDQU </c> instruction. 3209 /// 3210 /// \param __p 3211 /// A pointer to a 256-bit integer vector containing integer values. 3212 /// \returns A 256-bit integer vector containing the moved values. 3213 static __inline __m256i __DEFAULT_FN_ATTRS 3214 _mm256_lddqu_si256(__m256i const *__p) 3215 { 3216 return (__m256i)__builtin_ia32_lddqu256((char const *)__p); 3217 } 3218 3219 /* SIMD store ops */ 3220 /// \brief Stores double-precision floating point values from a 256-bit vector 3221 /// of [4 x double] to a 32-byte aligned memory location pointed to by 3222 /// \a __p. 3223 /// 3224 /// \headerfile <x86intrin.h> 3225 /// 3226 /// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. 3227 /// 3228 /// \param __p 3229 /// A 32-byte aligned pointer to a memory location that will receive the 3230 /// double-precision floaing point values. 3231 /// \param __a 3232 /// A 256-bit vector of [4 x double] containing the values to be moved. 3233 static __inline void __DEFAULT_FN_ATTRS 3234 _mm256_store_pd(double *__p, __m256d __a) 3235 { 3236 *(__m256d *)__p = __a; 3237 } 3238 3239 /// \brief Stores single-precision floating point values from a 256-bit vector 3240 /// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p. 
3241 /// 3242 /// \headerfile <x86intrin.h> 3243 /// 3244 /// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. 3245 /// 3246 /// \param __p 3247 /// A 32-byte aligned pointer to a memory location that will receive the 3248 /// float values. 3249 /// \param __a 3250 /// A 256-bit vector of [8 x float] containing the values to be moved. 3251 static __inline void __DEFAULT_FN_ATTRS 3252 _mm256_store_ps(float *__p, __m256 __a) 3253 { 3254 *(__m256 *)__p = __a; 3255 } 3256 3257 /// \brief Stores double-precision floating point values from a 256-bit vector 3258 /// of [4 x double] to an unaligned memory location pointed to by \a __p. 3259 /// 3260 /// \headerfile <x86intrin.h> 3261 /// 3262 /// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. 3263 /// 3264 /// \param __p 3265 /// A pointer to a memory location that will receive the double-precision 3266 /// floating point values. 3267 /// \param __a 3268 /// A 256-bit vector of [4 x double] containing the values to be moved. 3269 static __inline void __DEFAULT_FN_ATTRS 3270 _mm256_storeu_pd(double *__p, __m256d __a) 3271 { 3272 struct __storeu_pd { 3273 __m256d __v; 3274 } __attribute__((__packed__, __may_alias__)); 3275 ((struct __storeu_pd*)__p)->__v = __a; 3276 } 3277 3278 /// \brief Stores single-precision floating point values from a 256-bit vector 3279 /// of [8 x float] to an unaligned memory location pointed to by \a __p. 3280 /// 3281 /// \headerfile <x86intrin.h> 3282 /// 3283 /// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. 3284 /// 3285 /// \param __p 3286 /// A pointer to a memory location that will receive the float values. 3287 /// \param __a 3288 /// A 256-bit vector of [8 x float] containing the values to be moved. 
3289 static __inline void __DEFAULT_FN_ATTRS 3290 _mm256_storeu_ps(float *__p, __m256 __a) 3291 { 3292 struct __storeu_ps { 3293 __m256 __v; 3294 } __attribute__((__packed__, __may_alias__)); 3295 ((struct __storeu_ps*)__p)->__v = __a; 3296 } 3297 3298 /// \brief Stores integer values from a 256-bit integer vector to a 32-byte 3299 /// aligned memory location pointed to by \a __p. 3300 /// 3301 /// \headerfile <x86intrin.h> 3302 /// 3303 /// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. 3304 /// 3305 /// \param __p 3306 /// A 32-byte aligned pointer to a memory location that will receive the 3307 /// integer values. 3308 /// \param __a 3309 /// A 256-bit integer vector containing the values to be moved. 3310 static __inline void __DEFAULT_FN_ATTRS 3311 _mm256_store_si256(__m256i *__p, __m256i __a) 3312 { 3313 *__p = __a; 3314 } 3315 3316 /// \brief Stores integer values from a 256-bit integer vector to an unaligned 3317 /// memory location pointed to by \a __p. 3318 /// 3319 /// \headerfile <x86intrin.h> 3320 /// 3321 /// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. 3322 /// 3323 /// \param __p 3324 /// A pointer to a memory location that will receive the integer values. 3325 /// \param __a 3326 /// A 256-bit integer vector containing the values to be moved. 3327 static __inline void __DEFAULT_FN_ATTRS 3328 _mm256_storeu_si256(__m256i *__p, __m256i __a) 3329 { 3330 struct __storeu_si256 { 3331 __m256i __v; 3332 } __attribute__((__packed__, __may_alias__)); 3333 ((struct __storeu_si256*)__p)->__v = __a; 3334 } 3335 3336 /* Conditional load ops */ 3337 /// \brief Conditionally loads double-precision floating point elements from a 3338 /// memory location pointed to by \a __p into a 128-bit vector of 3339 /// [2 x double], depending on the mask bits associated with each data 3340 /// element. 3341 /// 3342 /// \headerfile <x86intrin.h> 3343 /// 3344 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 
3345 /// 3346 /// \param __p 3347 /// A pointer to a memory location that contains the double-precision 3348 /// floating point values. 3349 /// \param __m 3350 /// A 128-bit integer vector containing the mask. The most significant bit of 3351 /// each data element represents the mask bits. If a mask bit is zero, the 3352 /// corresponding value in the memory location is not loaded and the 3353 /// corresponding field in the return value is set to zero. 3354 /// \returns A 128-bit vector of [2 x double] containing the loaded values. 3355 static __inline __m128d __DEFAULT_FN_ATTRS 3356 _mm_maskload_pd(double const *__p, __m128i __m) 3357 { 3358 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m); 3359 } 3360 3361 /// \brief Conditionally loads double-precision floating point elements from a 3362 /// memory location pointed to by \a __p into a 256-bit vector of 3363 /// [4 x double], depending on the mask bits associated with each data 3364 /// element. 3365 /// 3366 /// \headerfile <x86intrin.h> 3367 /// 3368 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3369 /// 3370 /// \param __p 3371 /// A pointer to a memory location that contains the double-precision 3372 /// floating point values. 3373 /// \param __m 3374 /// A 256-bit integer vector of [4 x quadword] containing the mask. The most 3375 /// significant bit of each quadword element represents the mask bits. If a 3376 /// mask bit is zero, the corresponding value in the memory location is not 3377 /// loaded and the corresponding field in the return value is set to zero. 3378 /// \returns A 256-bit vector of [4 x double] containing the loaded values. 
3379 static __inline __m256d __DEFAULT_FN_ATTRS 3380 _mm256_maskload_pd(double const *__p, __m256i __m) 3381 { 3382 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p, 3383 (__v4di)__m); 3384 } 3385 3386 /// \brief Conditionally loads single-precision floating point elements from a 3387 /// memory location pointed to by \a __p into a 128-bit vector of 3388 /// [4 x float], depending on the mask bits associated with each data 3389 /// element. 3390 /// 3391 /// \headerfile <x86intrin.h> 3392 /// 3393 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3394 /// 3395 /// \param __p 3396 /// A pointer to a memory location that contains the single-precision 3397 /// floating point values. 3398 /// \param __m 3399 /// A 128-bit integer vector containing the mask. The most significant bit of 3400 /// each data element represents the mask bits. If a mask bit is zero, the 3401 /// corresponding value in the memory location is not loaded and the 3402 /// corresponding field in the return value is set to zero. 3403 /// \returns A 128-bit vector of [4 x float] containing the loaded values. 3404 static __inline __m128 __DEFAULT_FN_ATTRS 3405 _mm_maskload_ps(float const *__p, __m128i __m) 3406 { 3407 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m); 3408 } 3409 3410 /// \brief Conditionally loads single-precision floating point elements from a 3411 /// memory location pointed to by \a __p into a 256-bit vector of 3412 /// [8 x float], depending on the mask bits associated with each data 3413 /// element. 3414 /// 3415 /// \headerfile <x86intrin.h> 3416 /// 3417 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3418 /// 3419 /// \param __p 3420 /// A pointer to a memory location that contains the single-precision 3421 /// floating point values. 3422 /// \param __m 3423 /// A 256-bit integer vector of [8 x dword] containing the mask. 
The most 3424 /// significant bit of each dword element represents the mask bits. If a mask 3425 /// bit is zero, the corresponding value in the memory location is not loaded 3426 /// and the corresponding field in the return value is set to zero. 3427 /// \returns A 256-bit vector of [8 x float] containing the loaded values. 3428 static __inline __m256 __DEFAULT_FN_ATTRS 3429 _mm256_maskload_ps(float const *__p, __m256i __m) 3430 { 3431 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m); 3432 } 3433 3434 /* Conditional store ops */ 3435 /// \brief Moves single-precision floating point values from a 256-bit vector 3436 /// of [8 x float] to a memory location pointed to by \a __p, according to 3437 /// the specified mask. 3438 /// 3439 /// \headerfile <x86intrin.h> 3440 /// 3441 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3442 /// 3443 /// \param __p 3444 /// A pointer to a memory location that will receive the float values. 3445 /// \param __m 3446 /// A 256-bit integer vector of [8 x dword] containing the mask. The most 3447 /// significant bit of each dword element in the mask vector represents the 3448 /// mask bits. If a mask bit is zero, the corresponding value from vector 3449 /// \a __a is not stored and the corresponding field in the memory location 3450 /// pointed to by \a __p is not changed. 3451 /// \param __a 3452 /// A 256-bit vector of [8 x float] containing the values to be stored. 3453 static __inline void __DEFAULT_FN_ATTRS 3454 _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a) 3455 { 3456 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a); 3457 } 3458 3459 /// \brief Moves double-precision values from a 128-bit vector of [2 x double] 3460 /// to a memory location pointed to by \a __p, according to the specified 3461 /// mask. 3462 /// 3463 /// \headerfile <x86intrin.h> 3464 /// 3465 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 
3466 /// 3467 /// \param __p 3468 /// A pointer to a memory location that will receive the float values. 3469 /// \param __m 3470 /// A 128-bit integer vector containing the mask. The most significant bit of 3471 /// each field in the mask vector represents the mask bits. If a mask bit is 3472 /// zero, the corresponding value from vector \a __a is not stored and the 3473 /// corresponding field in the memory location pointed to by \a __p is not 3474 /// changed. 3475 /// \param __a 3476 /// A 128-bit vector of [2 x double] containing the values to be stored. 3477 static __inline void __DEFAULT_FN_ATTRS 3478 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a) 3479 { 3480 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a); 3481 } 3482 3483 /// \brief Moves double-precision values from a 256-bit vector of [4 x double] 3484 /// to a memory location pointed to by \a __p, according to the specified 3485 /// mask. 3486 /// 3487 /// \headerfile <x86intrin.h> 3488 /// 3489 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3490 /// 3491 /// \param __p 3492 /// A pointer to a memory location that will receive the float values. 3493 /// \param __m 3494 /// A 256-bit integer vector of [4 x quadword] containing the mask. The most 3495 /// significant bit of each quadword element in the mask vector represents 3496 /// the mask bits. If a mask bit is zero, the corresponding value from vector 3497 /// __a is not stored and the corresponding field in the memory location 3498 /// pointed to by \a __p is not changed. 3499 /// \param __a 3500 /// A 256-bit vector of [4 x double] containing the values to be stored. 
3501 static __inline void __DEFAULT_FN_ATTRS 3502 _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a) 3503 { 3504 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a); 3505 } 3506 3507 /// \brief Moves single-precision floating point values from a 128-bit vector 3508 /// of [4 x float] to a memory location pointed to by \a __p, according to 3509 /// the specified mask. 3510 /// 3511 /// \headerfile <x86intrin.h> 3512 /// 3513 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3514 /// 3515 /// \param __p 3516 /// A pointer to a memory location that will receive the float values. 3517 /// \param __m 3518 /// A 128-bit integer vector containing the mask. The most significant bit of 3519 /// each field in the mask vector represents the mask bits. If a mask bit is 3520 /// zero, the corresponding value from vector __a is not stored and the 3521 /// corresponding field in the memory location pointed to by \a __p is not 3522 /// changed. 3523 /// \param __a 3524 /// A 128-bit vector of [4 x float] containing the values to be stored. 3525 static __inline void __DEFAULT_FN_ATTRS 3526 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) 3527 { 3528 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a); 3529 } 3530 3531 /* Cacheability support ops */ 3532 /// \brief Moves integer data from a 256-bit integer vector to a 32-byte 3533 /// aligned memory location. To minimize caching, the data is flagged as 3534 /// non-temporal (unlikely to be used again soon). 3535 /// 3536 /// \headerfile <x86intrin.h> 3537 /// 3538 /// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction. 3539 /// 3540 /// \param __a 3541 /// A pointer to a 32-byte aligned memory location that will receive the 3542 /// integer values. 3543 /// \param __b 3544 /// A 256-bit integer vector containing the values to be moved. 
3545 static __inline void __DEFAULT_FN_ATTRS 3546 _mm256_stream_si256(__m256i *__a, __m256i __b) 3547 { 3548 __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a); 3549 } 3550 3551 /// \brief Moves double-precision values from a 256-bit vector of [4 x double] 3552 /// to a 32-byte aligned memory location. To minimize caching, the data is 3553 /// flagged as non-temporal (unlikely to be used again soon). 3554 /// 3555 /// \headerfile <x86intrin.h> 3556 /// 3557 /// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction. 3558 /// 3559 /// \param __a 3560 /// A pointer to a 32-byte aligned memory location that will receive the 3561 /// integer values. 3562 /// \param __b 3563 /// A 256-bit vector of [4 x double] containing the values to be moved. 3564 static __inline void __DEFAULT_FN_ATTRS 3565 _mm256_stream_pd(double *__a, __m256d __b) 3566 { 3567 __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a); 3568 } 3569 3570 /// \brief Moves single-precision floating point values from a 256-bit vector 3571 /// of [8 x float] to a 32-byte aligned memory location. To minimize 3572 /// caching, the data is flagged as non-temporal (unlikely to be used again 3573 /// soon). 3574 /// 3575 /// \headerfile <x86intrin.h> 3576 /// 3577 /// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction. 3578 /// 3579 /// \param __p 3580 /// A pointer to a 32-byte aligned memory location that will receive the 3581 /// single-precision floating point values. 3582 /// \param __a 3583 /// A 256-bit vector of [8 x float] containing the values to be moved. 3584 static __inline void __DEFAULT_FN_ATTRS 3585 _mm256_stream_ps(float *__p, __m256 __a) 3586 { 3587 __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p); 3588 } 3589 3590 /* Create vectors */ 3591 /// \brief Create a 256-bit vector of [4 x double] with undefined values. 3592 /// 3593 /// \headerfile <x86intrin.h> 3594 /// 3595 /// This intrinsic has no corresponding instruction. 
3596 /// 3597 /// \returns A 256-bit vector of [4 x double] containing undefined values. 3598 static __inline__ __m256d __DEFAULT_FN_ATTRS 3599 _mm256_undefined_pd(void) 3600 { 3601 return (__m256d)__builtin_ia32_undef256(); 3602 } 3603 3604 /// \brief Create a 256-bit vector of [8 x float] with undefined values. 3605 /// 3606 /// \headerfile <x86intrin.h> 3607 /// 3608 /// This intrinsic has no corresponding instruction. 3609 /// 3610 /// \returns A 256-bit vector of [8 x float] containing undefined values. 3611 static __inline__ __m256 __DEFAULT_FN_ATTRS 3612 _mm256_undefined_ps(void) 3613 { 3614 return (__m256)__builtin_ia32_undef256(); 3615 } 3616 3617 /// \brief Create a 256-bit integer vector with undefined values. 3618 /// 3619 /// \headerfile <x86intrin.h> 3620 /// 3621 /// This intrinsic has no corresponding instruction. 3622 /// 3623 /// \returns A 256-bit integer vector containing undefined values. 3624 static __inline__ __m256i __DEFAULT_FN_ATTRS 3625 _mm256_undefined_si256(void) 3626 { 3627 return (__m256i)__builtin_ia32_undef256(); 3628 } 3629 3630 /// \brief Constructs a 256-bit floating-point vector of [4 x double] 3631 /// initialized with the specified double-precision floating-point values. 3632 /// 3633 /// \headerfile <x86intrin.h> 3634 /// 3635 /// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> 3636 /// instruction. 3637 /// 3638 /// \param __a 3639 /// A double-precision floating-point value used to initialize bits [255:192] 3640 /// of the result. 3641 /// \param __b 3642 /// A double-precision floating-point value used to initialize bits [191:128] 3643 /// of the result. 3644 /// \param __c 3645 /// A double-precision floating-point value used to initialize bits [127:64] 3646 /// of the result. 3647 /// \param __d 3648 /// A double-precision floating-point value used to initialize bits [63:0] 3649 /// of the result. 3650 /// \returns An initialized 256-bit floating-point vector of [4 x double]. 
3651 static __inline __m256d __DEFAULT_FN_ATTRS 3652 _mm256_set_pd(double __a, double __b, double __c, double __d) 3653 { 3654 return (__m256d){ __d, __c, __b, __a }; 3655 } 3656 3657 /// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized 3658 /// with the specified single-precision floating-point values. 3659 /// 3660 /// \headerfile <x86intrin.h> 3661 /// 3662 /// This intrinsic is a utility function and does not correspond to a specific 3663 /// instruction. 3664 /// 3665 /// \param __a 3666 /// A single-precision floating-point value used to initialize bits [255:224] 3667 /// of the result. 3668 /// \param __b 3669 /// A single-precision floating-point value used to initialize bits [223:192] 3670 /// of the result. 3671 /// \param __c 3672 /// A single-precision floating-point value used to initialize bits [191:160] 3673 /// of the result. 3674 /// \param __d 3675 /// A single-precision floating-point value used to initialize bits [159:128] 3676 /// of the result. 3677 /// \param __e 3678 /// A single-precision floating-point value used to initialize bits [127:96] 3679 /// of the result. 3680 /// \param __f 3681 /// A single-precision floating-point value used to initialize bits [95:64] 3682 /// of the result. 3683 /// \param __g 3684 /// A single-precision floating-point value used to initialize bits [63:32] 3685 /// of the result. 3686 /// \param __h 3687 /// A single-precision floating-point value used to initialize bits [31:0] 3688 /// of the result. 3689 /// \returns An initialized 256-bit floating-point vector of [8 x float]. 3690 static __inline __m256 __DEFAULT_FN_ATTRS 3691 _mm256_set_ps(float __a, float __b, float __c, float __d, 3692 float __e, float __f, float __g, float __h) 3693 { 3694 return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a }; 3695 } 3696 3697 /// \brief Constructs a 256-bit integer vector initialized with the specified 3698 /// 32-bit integral values. 
3699 /// 3700 /// \headerfile <x86intrin.h> 3701 /// 3702 /// This intrinsic is a utility function and does not correspond to a specific 3703 /// instruction. 3704 /// 3705 /// \param __i0 3706 /// A 32-bit integral value used to initialize bits [255:224] of the result. 3707 /// \param __i1 3708 /// A 32-bit integral value used to initialize bits [223:192] of the result. 3709 /// \param __i2 3710 /// A 32-bit integral value used to initialize bits [191:160] of the result. 3711 /// \param __i3 3712 /// A 32-bit integral value used to initialize bits [159:128] of the result. 3713 /// \param __i4 3714 /// A 32-bit integral value used to initialize bits [127:96] of the result. 3715 /// \param __i5 3716 /// A 32-bit integral value used to initialize bits [95:64] of the result. 3717 /// \param __i6 3718 /// A 32-bit integral value used to initialize bits [63:32] of the result. 3719 /// \param __i7 3720 /// A 32-bit integral value used to initialize bits [31:0] of the result. 3721 /// \returns An initialized 256-bit integer vector. 3722 static __inline __m256i __DEFAULT_FN_ATTRS 3723 _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, 3724 int __i4, int __i5, int __i6, int __i7) 3725 { 3726 return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 }; 3727 } 3728 3729 /// \brief Constructs a 256-bit integer vector initialized with the specified 3730 /// 16-bit integral values. 3731 /// 3732 /// \headerfile <x86intrin.h> 3733 /// 3734 /// This intrinsic is a utility function and does not correspond to a specific 3735 /// instruction. 3736 /// 3737 /// \param __w15 3738 /// A 16-bit integral value used to initialize bits [255:240] of the result. 3739 /// \param __w14 3740 /// A 16-bit integral value used to initialize bits [239:224] of the result. 3741 /// \param __w13 3742 /// A 16-bit integral value used to initialize bits [223:208] of the result. 3743 /// \param __w12 3744 /// A 16-bit integral value used to initialize bits [207:192] of the result. 
3745 /// \param __w11 3746 /// A 16-bit integral value used to initialize bits [191:176] of the result. 3747 /// \param __w10 3748 /// A 16-bit integral value used to initialize bits [175:160] of the result. 3749 /// \param __w09 3750 /// A 16-bit integral value used to initialize bits [159:144] of the result. 3751 /// \param __w08 3752 /// A 16-bit integral value used to initialize bits [143:128] of the result. 3753 /// \param __w07 3754 /// A 16-bit integral value used to initialize bits [127:112] of the result. 3755 /// \param __w06 3756 /// A 16-bit integral value used to initialize bits [111:96] of the result. 3757 /// \param __w05 3758 /// A 16-bit integral value used to initialize bits [95:80] of the result. 3759 /// \param __w04 3760 /// A 16-bit integral value used to initialize bits [79:64] of the result. 3761 /// \param __w03 3762 /// A 16-bit integral value used to initialize bits [63:48] of the result. 3763 /// \param __w02 3764 /// A 16-bit integral value used to initialize bits [47:32] of the result. 3765 /// \param __w01 3766 /// A 16-bit integral value used to initialize bits [31:16] of the result. 3767 /// \param __w00 3768 /// A 16-bit integral value used to initialize bits [15:0] of the result. 3769 /// \returns An initialized 256-bit integer vector. 3770 static __inline __m256i __DEFAULT_FN_ATTRS 3771 _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, 3772 short __w11, short __w10, short __w09, short __w08, 3773 short __w07, short __w06, short __w05, short __w04, 3774 short __w03, short __w02, short __w01, short __w00) 3775 { 3776 return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06, 3777 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 }; 3778 } 3779 3780 /// \brief Constructs a 256-bit integer vector initialized with the specified 3781 /// 8-bit integral values. 
3782 /// 3783 /// \headerfile <x86intrin.h> 3784 /// 3785 /// This intrinsic is a utility function and does not correspond to a specific 3786 /// instruction. 3787 /// 3788 /// \param __b31 3789 /// An 8-bit integral value used to initialize bits [255:248] of the result. 3790 /// \param __b30 3791 /// An 8-bit integral value used to initialize bits [247:240] of the result. 3792 /// \param __b29 3793 /// An 8-bit integral value used to initialize bits [239:232] of the result. 3794 /// \param __b28 3795 /// An 8-bit integral value used to initialize bits [231:224] of the result. 3796 /// \param __b27 3797 /// An 8-bit integral value used to initialize bits [223:216] of the result. 3798 /// \param __b26 3799 /// An 8-bit integral value used to initialize bits [215:208] of the result. 3800 /// \param __b25 3801 /// An 8-bit integral value used to initialize bits [207:200] of the result. 3802 /// \param __b24 3803 /// An 8-bit integral value used to initialize bits [199:192] of the result. 3804 /// \param __b23 3805 /// An 8-bit integral value used to initialize bits [191:184] of the result. 3806 /// \param __b22 3807 /// An 8-bit integral value used to initialize bits [183:176] of the result. 3808 /// \param __b21 3809 /// An 8-bit integral value used to initialize bits [175:168] of the result. 3810 /// \param __b20 3811 /// An 8-bit integral value used to initialize bits [167:160] of the result. 3812 /// \param __b19 3813 /// An 8-bit integral value used to initialize bits [159:152] of the result. 3814 /// \param __b18 3815 /// An 8-bit integral value used to initialize bits [151:144] of the result. 3816 /// \param __b17 3817 /// An 8-bit integral value used to initialize bits [143:136] of the result. 3818 /// \param __b16 3819 /// An 8-bit integral value used to initialize bits [135:128] of the result. 3820 /// \param __b15 3821 /// An 8-bit integral value used to initialize bits [127:120] of the result. 
3822 /// \param __b14 3823 /// An 8-bit integral value used to initialize bits [119:112] of the result. 3824 /// \param __b13 3825 /// An 8-bit integral value used to initialize bits [111:104] of the result. 3826 /// \param __b12 3827 /// An 8-bit integral value used to initialize bits [103:96] of the result. 3828 /// \param __b11 3829 /// An 8-bit integral value used to initialize bits [95:88] of the result. 3830 /// \param __b10 3831 /// An 8-bit integral value used to initialize bits [87:80] of the result. 3832 /// \param __b09 3833 /// An 8-bit integral value used to initialize bits [79:72] of the result. 3834 /// \param __b08 3835 /// An 8-bit integral value used to initialize bits [71:64] of the result. 3836 /// \param __b07 3837 /// An 8-bit integral value used to initialize bits [63:56] of the result. 3838 /// \param __b06 3839 /// An 8-bit integral value used to initialize bits [55:48] of the result. 3840 /// \param __b05 3841 /// An 8-bit integral value used to initialize bits [47:40] of the result. 3842 /// \param __b04 3843 /// An 8-bit integral value used to initialize bits [39:32] of the result. 3844 /// \param __b03 3845 /// An 8-bit integral value used to initialize bits [31:24] of the result. 3846 /// \param __b02 3847 /// An 8-bit integral value used to initialize bits [23:16] of the result. 3848 /// \param __b01 3849 /// An 8-bit integral value used to initialize bits [15:8] of the result. 3850 /// \param __b00 3851 /// An 8-bit integral value used to initialize bits [7:0] of the result. 3852 /// \returns An initialized 256-bit integer vector. 
3853 static __inline __m256i __DEFAULT_FN_ATTRS 3854 _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, 3855 char __b27, char __b26, char __b25, char __b24, 3856 char __b23, char __b22, char __b21, char __b20, 3857 char __b19, char __b18, char __b17, char __b16, 3858 char __b15, char __b14, char __b13, char __b12, 3859 char __b11, char __b10, char __b09, char __b08, 3860 char __b07, char __b06, char __b05, char __b04, 3861 char __b03, char __b02, char __b01, char __b00) 3862 { 3863 return (__m256i)(__v32qi){ 3864 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07, 3865 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15, 3866 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23, 3867 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31 3868 }; 3869 } 3870 3871 /// \brief Constructs a 256-bit integer vector initialized with the specified 3872 /// 64-bit integral values. 3873 /// 3874 /// \headerfile <x86intrin.h> 3875 /// 3876 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c> 3877 /// instruction. 3878 /// 3879 /// \param __a 3880 /// A 64-bit integral value used to initialize bits [255:192] of the result. 3881 /// \param __b 3882 /// A 64-bit integral value used to initialize bits [191:128] of the result. 3883 /// \param __c 3884 /// A 64-bit integral value used to initialize bits [127:64] of the result. 3885 /// \param __d 3886 /// A 64-bit integral value used to initialize bits [63:0] of the result. 3887 /// \returns An initialized 256-bit integer vector. 3888 static __inline __m256i __DEFAULT_FN_ATTRS 3889 _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) 3890 { 3891 return (__m256i)(__v4di){ __d, __c, __b, __a }; 3892 } 3893 3894 /* Create vectors with elements in reverse order */ 3895 /// \brief Constructs a 256-bit floating-point vector of [4 x double], 3896 /// initialized in reverse order with the specified double-precision 3897 /// floating-point values. 
3898 /// 3899 /// \headerfile <x86intrin.h> 3900 /// 3901 /// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> 3902 /// instruction. 3903 /// 3904 /// \param __a 3905 /// A double-precision floating-point value used to initialize bits [63:0] 3906 /// of the result. 3907 /// \param __b 3908 /// A double-precision floating-point value used to initialize bits [127:64] 3909 /// of the result. 3910 /// \param __c 3911 /// A double-precision floating-point value used to initialize bits [191:128] 3912 /// of the result. 3913 /// \param __d 3914 /// A double-precision floating-point value used to initialize bits [255:192] 3915 /// of the result. 3916 /// \returns An initialized 256-bit floating-point vector of [4 x double]. 3917 static __inline __m256d __DEFAULT_FN_ATTRS 3918 _mm256_setr_pd(double __a, double __b, double __c, double __d) 3919 { 3920 return (__m256d){ __a, __b, __c, __d }; 3921 } 3922 3923 /// \brief Constructs a 256-bit floating-point vector of [8 x float], 3924 /// initialized in reverse order with the specified single-precision 3925 /// float-point values. 3926 /// 3927 /// \headerfile <x86intrin.h> 3928 /// 3929 /// This intrinsic is a utility function and does not correspond to a specific 3930 /// instruction. 3931 /// 3932 /// \param __a 3933 /// A single-precision floating-point value used to initialize bits [31:0] 3934 /// of the result. 3935 /// \param __b 3936 /// A single-precision floating-point value used to initialize bits [63:32] 3937 /// of the result. 3938 /// \param __c 3939 /// A single-precision floating-point value used to initialize bits [95:64] 3940 /// of the result. 3941 /// \param __d 3942 /// A single-precision floating-point value used to initialize bits [127:96] 3943 /// of the result. 3944 /// \param __e 3945 /// A single-precision floating-point value used to initialize bits [159:128] 3946 /// of the result. 
3947 /// \param __f 3948 /// A single-precision floating-point value used to initialize bits [191:160] 3949 /// of the result. 3950 /// \param __g 3951 /// A single-precision floating-point value used to initialize bits [223:192] 3952 /// of the result. 3953 /// \param __h 3954 /// A single-precision floating-point value used to initialize bits [255:224] 3955 /// of the result. 3956 /// \returns An initialized 256-bit floating-point vector of [8 x float]. 3957 static __inline __m256 __DEFAULT_FN_ATTRS 3958 _mm256_setr_ps(float __a, float __b, float __c, float __d, 3959 float __e, float __f, float __g, float __h) 3960 { 3961 return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h }; 3962 } 3963 3964 /// \brief Constructs a 256-bit integer vector, initialized in reverse order 3965 /// with the specified 32-bit integral values. 3966 /// 3967 /// \headerfile <x86intrin.h> 3968 /// 3969 /// This intrinsic is a utility function and does not correspond to a specific 3970 /// instruction. 3971 /// 3972 /// \param __i0 3973 /// A 32-bit integral value used to initialize bits [31:0] of the result. 3974 /// \param __i1 3975 /// A 32-bit integral value used to initialize bits [63:32] of the result. 3976 /// \param __i2 3977 /// A 32-bit integral value used to initialize bits [95:64] of the result. 3978 /// \param __i3 3979 /// A 32-bit integral value used to initialize bits [127:96] of the result. 3980 /// \param __i4 3981 /// A 32-bit integral value used to initialize bits [159:128] of the result. 3982 /// \param __i5 3983 /// A 32-bit integral value used to initialize bits [191:160] of the result. 3984 /// \param __i6 3985 /// A 32-bit integral value used to initialize bits [223:192] of the result. 3986 /// \param __i7 3987 /// A 32-bit integral value used to initialize bits [255:224] of the result. 3988 /// \returns An initialized 256-bit integer vector. 
3989 static __inline __m256i __DEFAULT_FN_ATTRS 3990 _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, 3991 int __i4, int __i5, int __i6, int __i7) 3992 { 3993 return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 }; 3994 } 3995 3996 /// \brief Constructs a 256-bit integer vector, initialized in reverse order 3997 /// with the specified 16-bit integral values. 3998 /// 3999 /// \headerfile <x86intrin.h> 4000 /// 4001 /// This intrinsic is a utility function and does not correspond to a specific 4002 /// instruction. 4003 /// 4004 /// \param __w15 4005 /// A 16-bit integral value used to initialize bits [15:0] of the result. 4006 /// \param __w14 4007 /// A 16-bit integral value used to initialize bits [31:16] of the result. 4008 /// \param __w13 4009 /// A 16-bit integral value used to initialize bits [47:32] of the result. 4010 /// \param __w12 4011 /// A 16-bit integral value used to initialize bits [63:48] of the result. 4012 /// \param __w11 4013 /// A 16-bit integral value used to initialize bits [79:64] of the result. 4014 /// \param __w10 4015 /// A 16-bit integral value used to initialize bits [95:80] of the result. 4016 /// \param __w09 4017 /// A 16-bit integral value used to initialize bits [111:96] of the result. 4018 /// \param __w08 4019 /// A 16-bit integral value used to initialize bits [127:112] of the result. 4020 /// \param __w07 4021 /// A 16-bit integral value used to initialize bits [143:128] of the result. 4022 /// \param __w06 4023 /// A 16-bit integral value used to initialize bits [159:144] of the result. 4024 /// \param __w05 4025 /// A 16-bit integral value used to initialize bits [175:160] of the result. 4026 /// \param __w04 4027 /// A 16-bit integral value used to initialize bits [191:176] of the result. 4028 /// \param __w03 4029 /// A 16-bit integral value used to initialize bits [207:192] of the result. 4030 /// \param __w02 4031 /// A 16-bit integral value used to initialize bits [223:208] of the result. 
4032 /// \param __w01 4033 /// A 16-bit integral value used to initialize bits [239:224] of the result. 4034 /// \param __w00 4035 /// A 16-bit integral value used to initialize bits [255:240] of the result. 4036 /// \returns An initialized 256-bit integer vector. 4037 static __inline __m256i __DEFAULT_FN_ATTRS 4038 _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, 4039 short __w11, short __w10, short __w09, short __w08, 4040 short __w07, short __w06, short __w05, short __w04, 4041 short __w03, short __w02, short __w01, short __w00) 4042 { 4043 return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09, 4044 __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 }; 4045 } 4046 4047 /// \brief Constructs a 256-bit integer vector, initialized in reverse order 4048 /// with the specified 8-bit integral values. 4049 /// 4050 /// \headerfile <x86intrin.h> 4051 /// 4052 /// This intrinsic is a utility function and does not correspond to a specific 4053 /// instruction. 4054 /// 4055 /// \param __b31 4056 /// An 8-bit integral value used to initialize bits [7:0] of the result. 4057 /// \param __b30 4058 /// An 8-bit integral value used to initialize bits [15:8] of the result. 4059 /// \param __b29 4060 /// An 8-bit integral value used to initialize bits [23:16] of the result. 4061 /// \param __b28 4062 /// An 8-bit integral value used to initialize bits [31:24] of the result. 4063 /// \param __b27 4064 /// An 8-bit integral value used to initialize bits [39:32] of the result. 4065 /// \param __b26 4066 /// An 8-bit integral value used to initialize bits [47:40] of the result. 4067 /// \param __b25 4068 /// An 8-bit integral value used to initialize bits [55:48] of the result. 4069 /// \param __b24 4070 /// An 8-bit integral value used to initialize bits [63:56] of the result. 4071 /// \param __b23 4072 /// An 8-bit integral value used to initialize bits [71:64] of the result. 
/// \param __b22
///    An 8-bit integral value used to initialize bits [79:72] of the result.
/// \param __b21
///    An 8-bit integral value used to initialize bits [87:80] of the result.
/// \param __b20
///    An 8-bit integral value used to initialize bits [95:88] of the result.
/// \param __b19
///    An 8-bit integral value used to initialize bits [103:96] of the result.
/// \param __b18
///    An 8-bit integral value used to initialize bits [111:104] of the result.
/// \param __b17
///    An 8-bit integral value used to initialize bits [119:112] of the result.
/// \param __b16
///    An 8-bit integral value used to initialize bits [127:120] of the result.
/// \param __b15
///    An 8-bit integral value used to initialize bits [135:128] of the result.
/// \param __b14
///    An 8-bit integral value used to initialize bits [143:136] of the result.
/// \param __b13
///    An 8-bit integral value used to initialize bits [151:144] of the result.
/// \param __b12
///    An 8-bit integral value used to initialize bits [159:152] of the result.
/// \param __b11
///    An 8-bit integral value used to initialize bits [167:160] of the result.
/// \param __b10
///    An 8-bit integral value used to initialize bits [175:168] of the result.
/// \param __b09
///    An 8-bit integral value used to initialize bits [183:176] of the result.
/// \param __b08
///    An 8-bit integral value used to initialize bits [191:184] of the result.
/// \param __b07
///    An 8-bit integral value used to initialize bits [199:192] of the result.
/// \param __b06
///    An 8-bit integral value used to initialize bits [207:200] of the result.
/// \param __b05
///    An 8-bit integral value used to initialize bits [215:208] of the result.
/// \param __b04
///    An 8-bit integral value used to initialize bits [223:216] of the result.
4111 /// \param __b03 4112 /// An 8-bit integral value used to initialize bits [231:224] of the result. 4113 /// \param __b02 4114 /// An 8-bit integral value used to initialize bits [239:232] of the result. 4115 /// \param __b01 4116 /// An 8-bit integral value used to initialize bits [247:240] of the result. 4117 /// \param __b00 4118 /// An 8-bit integral value used to initialize bits [255:248] of the result. 4119 /// \returns An initialized 256-bit integer vector. 4120 static __inline __m256i __DEFAULT_FN_ATTRS 4121 _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, 4122 char __b27, char __b26, char __b25, char __b24, 4123 char __b23, char __b22, char __b21, char __b20, 4124 char __b19, char __b18, char __b17, char __b16, 4125 char __b15, char __b14, char __b13, char __b12, 4126 char __b11, char __b10, char __b09, char __b08, 4127 char __b07, char __b06, char __b05, char __b04, 4128 char __b03, char __b02, char __b01, char __b00) 4129 { 4130 return (__m256i)(__v32qi){ 4131 __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24, 4132 __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16, 4133 __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08, 4134 __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 }; 4135 } 4136 4137 /// \brief Constructs a 256-bit integer vector, initialized in reverse order 4138 /// with the specified 64-bit integral values. 4139 /// 4140 /// \headerfile <x86intrin.h> 4141 /// 4142 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c> 4143 /// instruction. 4144 /// 4145 /// \param __a 4146 /// A 64-bit integral value used to initialize bits [63:0] of the result. 4147 /// \param __b 4148 /// A 64-bit integral value used to initialize bits [127:64] of the result. 4149 /// \param __c 4150 /// A 64-bit integral value used to initialize bits [191:128] of the result. 4151 /// \param __d 4152 /// A 64-bit integral value used to initialize bits [255:192] of the result. 
4153 /// \returns An initialized 256-bit integer vector. 4154 static __inline __m256i __DEFAULT_FN_ATTRS 4155 _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) 4156 { 4157 return (__m256i)(__v4di){ __a, __b, __c, __d }; 4158 } 4159 4160 /* Create vectors with repeated elements */ 4161 /// \brief Constructs a 256-bit floating-point vector of [4 x double], with each 4162 /// of the four double-precision floating-point vector elements set to the 4163 /// specified double-precision floating-point value. 4164 /// 4165 /// \headerfile <x86intrin.h> 4166 /// 4167 /// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. 4168 /// 4169 /// \param __w 4170 /// A double-precision floating-point value used to initialize each vector 4171 /// element of the result. 4172 /// \returns An initialized 256-bit floating-point vector of [4 x double]. 4173 static __inline __m256d __DEFAULT_FN_ATTRS 4174 _mm256_set1_pd(double __w) 4175 { 4176 return (__m256d){ __w, __w, __w, __w }; 4177 } 4178 4179 /// \brief Constructs a 256-bit floating-point vector of [8 x float], with each 4180 /// of the eight single-precision floating-point vector elements set to the 4181 /// specified single-precision floating-point value. 4182 /// 4183 /// \headerfile <x86intrin.h> 4184 /// 4185 /// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> 4186 /// instruction. 4187 /// 4188 /// \param __w 4189 /// A single-precision floating-point value used to initialize each vector 4190 /// element of the result. 4191 /// \returns An initialized 256-bit floating-point vector of [8 x float]. 4192 static __inline __m256 __DEFAULT_FN_ATTRS 4193 _mm256_set1_ps(float __w) 4194 { 4195 return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w }; 4196 } 4197 4198 /// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the 4199 /// 32-bit integral vector elements set to the specified 32-bit integral 4200 /// value. 
4201 /// 4202 /// \headerfile <x86intrin.h> 4203 /// 4204 /// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> 4205 /// instruction. 4206 /// 4207 /// \param __i 4208 /// A 32-bit integral value used to initialize each vector element of the 4209 /// result. 4210 /// \returns An initialized 256-bit integer vector of [8 x i32]. 4211 static __inline __m256i __DEFAULT_FN_ATTRS 4212 _mm256_set1_epi32(int __i) 4213 { 4214 return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i }; 4215 } 4216 4217 /// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the 4218 /// 16-bit integral vector elements set to the specified 16-bit integral 4219 /// value. 4220 /// 4221 /// \headerfile <x86intrin.h> 4222 /// 4223 /// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. 4224 /// 4225 /// \param __w 4226 /// A 16-bit integral value used to initialize each vector element of the 4227 /// result. 4228 /// \returns An initialized 256-bit integer vector of [16 x i16]. 4229 static __inline __m256i __DEFAULT_FN_ATTRS 4230 _mm256_set1_epi16(short __w) 4231 { 4232 return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w, 4233 __w, __w, __w, __w, __w, __w }; 4234 } 4235 4236 /// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the 4237 /// 8-bit integral vector elements set to the specified 8-bit integral value. 4238 /// 4239 /// \headerfile <x86intrin.h> 4240 /// 4241 /// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. 4242 /// 4243 /// \param __b 4244 /// An 8-bit integral value used to initialize each vector element of the 4245 /// result. 4246 /// \returns An initialized 256-bit integer vector of [32 x i8]. 
4247 static __inline __m256i __DEFAULT_FN_ATTRS 4248 _mm256_set1_epi8(char __b) 4249 { 4250 return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, 4251 __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, 4252 __b, __b, __b, __b, __b, __b, __b }; 4253 } 4254 4255 /// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the 4256 /// 64-bit integral vector elements set to the specified 64-bit integral 4257 /// value. 4258 /// 4259 /// \headerfile <x86intrin.h> 4260 /// 4261 /// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. 4262 /// 4263 /// \param __q 4264 /// A 64-bit integral value used to initialize each vector element of the 4265 /// result. 4266 /// \returns An initialized 256-bit integer vector of [4 x i64]. 4267 static __inline __m256i __DEFAULT_FN_ATTRS 4268 _mm256_set1_epi64x(long long __q) 4269 { 4270 return (__m256i)(__v4di){ __q, __q, __q, __q }; 4271 } 4272 4273 /* Create __zeroed vectors */ 4274 /// \brief Constructs a 256-bit floating-point vector of [4 x double] with all 4275 /// vector elements initialized to zero. 4276 /// 4277 /// \headerfile <x86intrin.h> 4278 /// 4279 /// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4280 /// 4281 /// \returns A 256-bit vector of [4 x double] with all elements set to zero. 4282 static __inline __m256d __DEFAULT_FN_ATTRS 4283 _mm256_setzero_pd(void) 4284 { 4285 return (__m256d){ 0, 0, 0, 0 }; 4286 } 4287 4288 /// \brief Constructs a 256-bit floating-point vector of [8 x float] with all 4289 /// vector elements initialized to zero. 4290 /// 4291 /// \headerfile <x86intrin.h> 4292 /// 4293 /// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4294 /// 4295 /// \returns A 256-bit vector of [8 x float] with all elements set to zero. 
4296 static __inline __m256 __DEFAULT_FN_ATTRS 4297 _mm256_setzero_ps(void) 4298 { 4299 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 }; 4300 } 4301 4302 /// \brief Constructs a 256-bit integer vector initialized to zero. 4303 /// 4304 /// \headerfile <x86intrin.h> 4305 /// 4306 /// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4307 /// 4308 /// \returns A 256-bit integer vector initialized to zero. 4309 static __inline __m256i __DEFAULT_FN_ATTRS 4310 _mm256_setzero_si256(void) 4311 { 4312 return (__m256i){ 0LL, 0LL, 0LL, 0LL }; 4313 } 4314 4315 /* Cast between vector types */ 4316 /// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit 4317 /// floating-point vector of [8 x float]. 4318 /// 4319 /// \headerfile <x86intrin.h> 4320 /// 4321 /// This intrinsic has no corresponding instruction. 4322 /// 4323 /// \param __a 4324 /// A 256-bit floating-point vector of [4 x double]. 4325 /// \returns A 256-bit floating-point vector of [8 x float] containing the same 4326 /// bitwise pattern as the parameter. 4327 static __inline __m256 __DEFAULT_FN_ATTRS 4328 _mm256_castpd_ps(__m256d __a) 4329 { 4330 return (__m256)__a; 4331 } 4332 4333 /// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit 4334 /// integer vector. 4335 /// 4336 /// \headerfile <x86intrin.h> 4337 /// 4338 /// This intrinsic has no corresponding instruction. 4339 /// 4340 /// \param __a 4341 /// A 256-bit floating-point vector of [4 x double]. 4342 /// \returns A 256-bit integer vector containing the same bitwise pattern as the 4343 /// parameter. 4344 static __inline __m256i __DEFAULT_FN_ATTRS 4345 _mm256_castpd_si256(__m256d __a) 4346 { 4347 return (__m256i)__a; 4348 } 4349 4350 /// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit 4351 /// floating-point vector of [4 x double]. 4352 /// 4353 /// \headerfile <x86intrin.h> 4354 /// 4355 /// This intrinsic has no corresponding instruction. 
4356 /// 4357 /// \param __a 4358 /// A 256-bit floating-point vector of [8 x float]. 4359 /// \returns A 256-bit floating-point vector of [4 x double] containing the same 4360 /// bitwise pattern as the parameter. 4361 static __inline __m256d __DEFAULT_FN_ATTRS 4362 _mm256_castps_pd(__m256 __a) 4363 { 4364 return (__m256d)__a; 4365 } 4366 4367 /// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit 4368 /// integer vector. 4369 /// 4370 /// \headerfile <x86intrin.h> 4371 /// 4372 /// This intrinsic has no corresponding instruction. 4373 /// 4374 /// \param __a 4375 /// A 256-bit floating-point vector of [8 x float]. 4376 /// \returns A 256-bit integer vector containing the same bitwise pattern as the 4377 /// parameter. 4378 static __inline __m256i __DEFAULT_FN_ATTRS 4379 _mm256_castps_si256(__m256 __a) 4380 { 4381 return (__m256i)__a; 4382 } 4383 4384 /// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector 4385 /// of [8 x float]. 4386 /// 4387 /// \headerfile <x86intrin.h> 4388 /// 4389 /// This intrinsic has no corresponding instruction. 4390 /// 4391 /// \param __a 4392 /// A 256-bit integer vector. 4393 /// \returns A 256-bit floating-point vector of [8 x float] containing the same 4394 /// bitwise pattern as the parameter. 4395 static __inline __m256 __DEFAULT_FN_ATTRS 4396 _mm256_castsi256_ps(__m256i __a) 4397 { 4398 return (__m256)__a; 4399 } 4400 4401 /// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector 4402 /// of [4 x double]. 4403 /// 4404 /// \headerfile <x86intrin.h> 4405 /// 4406 /// This intrinsic has no corresponding instruction. 4407 /// 4408 /// \param __a 4409 /// A 256-bit integer vector. 4410 /// \returns A 256-bit floating-point vector of [4 x double] containing the same 4411 /// bitwise pattern as the parameter. 
4412 static __inline __m256d __DEFAULT_FN_ATTRS 4413 _mm256_castsi256_pd(__m256i __a) 4414 { 4415 return (__m256d)__a; 4416 } 4417 4418 /// \brief Returns the lower 128 bits of a 256-bit floating-point vector of 4419 /// [4 x double] as a 128-bit floating-point vector of [2 x double]. 4420 /// 4421 /// \headerfile <x86intrin.h> 4422 /// 4423 /// This intrinsic has no corresponding instruction. 4424 /// 4425 /// \param __a 4426 /// A 256-bit floating-point vector of [4 x double]. 4427 /// \returns A 128-bit floating-point vector of [2 x double] containing the 4428 /// lower 128 bits of the parameter. 4429 static __inline __m128d __DEFAULT_FN_ATTRS 4430 _mm256_castpd256_pd128(__m256d __a) 4431 { 4432 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1); 4433 } 4434 4435 /// \brief Returns the lower 128 bits of a 256-bit floating-point vector of 4436 /// [8 x float] as a 128-bit floating-point vector of [4 x float]. 4437 /// 4438 /// \headerfile <x86intrin.h> 4439 /// 4440 /// This intrinsic has no corresponding instruction. 4441 /// 4442 /// \param __a 4443 /// A 256-bit floating-point vector of [8 x float]. 4444 /// \returns A 128-bit floating-point vector of [4 x float] containing the 4445 /// lower 128 bits of the parameter. 4446 static __inline __m128 __DEFAULT_FN_ATTRS 4447 _mm256_castps256_ps128(__m256 __a) 4448 { 4449 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3); 4450 } 4451 4452 /// \brief Truncates a 256-bit integer vector into a 128-bit integer vector. 4453 /// 4454 /// \headerfile <x86intrin.h> 4455 /// 4456 /// This intrinsic has no corresponding instruction. 4457 /// 4458 /// \param __a 4459 /// A 256-bit integer vector. 4460 /// \returns A 128-bit integer vector containing the lower 128 bits of the 4461 /// parameter. 
4462 static __inline __m128i __DEFAULT_FN_ATTRS 4463 _mm256_castsi256_si128(__m256i __a) 4464 { 4465 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1); 4466 } 4467 4468 /// \brief Constructs a 256-bit floating-point vector of [4 x double] from a 4469 /// 128-bit floating-point vector of [2 x double]. The lower 128 bits 4470 /// contain the value of the source vector. The contents of the upper 128 4471 /// bits are undefined. 4472 /// 4473 /// \headerfile <x86intrin.h> 4474 /// 4475 /// This intrinsic has no corresponding instruction. 4476 /// 4477 /// \param __a 4478 /// A 128-bit vector of [2 x double]. 4479 /// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits 4480 /// contain the value of the parameter. The contents of the upper 128 bits 4481 /// are undefined. 4482 static __inline __m256d __DEFAULT_FN_ATTRS 4483 _mm256_castpd128_pd256(__m128d __a) 4484 { 4485 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1); 4486 } 4487 4488 /// \brief Constructs a 256-bit floating-point vector of [8 x float] from a 4489 /// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain 4490 /// the value of the source vector. The contents of the upper 128 bits are 4491 /// undefined. 4492 /// 4493 /// \headerfile <x86intrin.h> 4494 /// 4495 /// This intrinsic has no corresponding instruction. 4496 /// 4497 /// \param __a 4498 /// A 128-bit vector of [4 x float]. 4499 /// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits 4500 /// contain the value of the parameter. The contents of the upper 128 bits 4501 /// are undefined. 4502 static __inline __m256 __DEFAULT_FN_ATTRS 4503 _mm256_castps128_ps256(__m128 __a) 4504 { 4505 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1); 4506 } 4507 4508 /// \brief Constructs a 256-bit integer vector from a 128-bit integer vector. 4509 /// The lower 128 bits contain the value of the source vector. 
The contents 4510 /// of the upper 128 bits are undefined. 4511 /// 4512 /// \headerfile <x86intrin.h> 4513 /// 4514 /// This intrinsic has no corresponding instruction. 4515 /// 4516 /// \param __a 4517 /// A 128-bit integer vector. 4518 /// \returns A 256-bit integer vector. The lower 128 bits contain the value of 4519 /// the parameter. The contents of the upper 128 bits are undefined. 4520 static __inline __m256i __DEFAULT_FN_ATTRS 4521 _mm256_castsi128_si256(__m128i __a) 4522 { 4523 return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1); 4524 } 4525 4526 /* 4527 Vector insert. 4528 We use macros rather than inlines because we only want to accept 4529 invocations where the immediate M is a constant expression. 4530 */ 4531 /// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating 4532 /// a 256-bit vector of [8 x float] given in the first parameter, and then 4533 /// replacing either the upper or the lower 128 bits with the contents of a 4534 /// 128-bit vector of [4 x float] in the second parameter. The immediate 4535 /// integer parameter determines between the upper or the lower 128 bits. 4536 /// 4537 /// \headerfile <x86intrin.h> 4538 /// 4539 /// \code 4540 /// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M); 4541 /// \endcode 4542 /// 4543 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4544 /// 4545 /// \param V1 4546 /// A 256-bit vector of [8 x float]. This vector is copied to the result 4547 /// first, and then either the upper or the lower 128 bits of the result will 4548 /// be replaced by the contents of \a V2. 4549 /// \param V2 4550 /// A 128-bit vector of [4 x float]. The contents of this parameter are 4551 /// written to either the upper or the lower 128 bits of the result depending 4552 /// on the value of parameter \a M. 4553 /// \param M 4554 /// An immediate integer. 
///    The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
  /* Indices 0-7 select from V1, 8-11 from the widened V2. */ \
  (__m256)__builtin_shufflevector( \
    (__v8sf)(__m256)(V1), \
    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
    ((((M) & 1) == 0) ?  8 :  0), \
    ((((M) & 1) == 0) ?  9 :  1), \
    ((((M) & 1) == 0) ? 10 :  2), \
    ((((M) & 1) == 0) ? 11 :  3), \
    ((((M) & 1) == 0) ?  4 :  8), \
    ((((M) & 1) == 0) ?  5 :  9), \
    ((((M) & 1) == 0) ?  6 : 10), \
    ((((M) & 1) == 0) ?  7 : 11) );})

/// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating
///    a 256-bit vector of [4 x double] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [2 x double] in the second parameter. The immediate
///    integer parameter determines between the upper or the lower 128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [2 x double].
///    The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
  /* Indices 0-3 select from V1, 4-5 from the widened V2. */ \
  (__m256d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V1), \
    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
    ((((M) & 1) == 0) ? 4 : 0), \
    ((((M) & 1) == 0) ? 5 : 1), \
    ((((M) & 1) == 0) ? 2 : 4), \
    ((((M) & 1) == 0) ? 3 : 5) );})

/// \brief Constructs a new 256-bit integer vector by first duplicating a
///    256-bit integer vector given in the first parameter, and then replacing
///    either the upper or the lower 128 bits with the contents of a 128-bit
///    integer vector in the second parameter. The immediate integer parameter
///    determines between the upper or the lower 128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. This vector is copied to the result first, and
///    then either the upper or the lower 128 bits of the result will be
///    replaced by the contents of \a V2.
/// \param V2
///    A 128-bit integer vector.
///    The contents of this parameter are written to
///    either the upper or the lower 128 bits of the result depending on the
///    value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
  /* Indices 0-3 select from V1, 4-5 from the widened V2. */ \
  (__m256i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V1), \
    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
    ((((M) & 1) == 0) ? 4 : 0), \
    ((((M) & 1) == 0) ? 5 : 1), \
    ((((M) & 1) == 0) ? 2 : 4), \
    ((((M) & 1) == 0) ? 3 : 5) );})

/*
   Vector extract.
   We use macros rather than inlines because we only want to accept
   invocations where the immediate M is a constant expression.
*/
/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [8 x float], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result.
///    \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
  /* Every index is below 8, so only V contributes to the result. */ \
  (__m128)__builtin_shufflevector( \
    (__v8sf)(__m256)(V), \
    (__v8sf)(_mm256_undefined_ps()), \
    ((((M) & 1) == 0) ? 0 : 4), \
    ((((M) & 1) == 0) ? 1 : 5), \
    ((((M) & 1) == 0) ? 2 : 6), \
    ((((M) & 1) == 0) ? 3 : 7) );})

/// \brief Selects either the upper or the lower 128 bits of a 256-bit vector
///    of [4 x double], according to the immediate integer parameter, and
///    returns them as a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
  /* Every index is below 4, so only V contributes to the result. */ \
  (__m128d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V), \
    (__v4df)(_mm256_undefined_pd()), \
    ((((M) & 1) == 0) ? 0 : 2), \
    ((((M) & 1) == 0) ? 1 : 3) );})

/// \brief Extracts either the upper or the lower 128 bits from a 256-bit
///    integer vector, as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
  /* Both indices stay in the range 0..3, so only elements of V are */ \
  /* selected; the second operand is never read. Only bit 0 of M is used. */ \
  (__m128i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V), \
    (__v4di)(_mm256_undefined_si256()), \
    (((M) & 1) ? 2 : 0), \
    (((M) & 1) ? 3 : 1) );})

/* SIMD load ops (unaligned) */
/// \brief Loads two 128-bit floating-point vectors of [4 x float] from
///    unaligned memory locations and constructs a 256-bit floating-point vector
///    of [8 x float] by concatenating the two 128-bit vectors.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to load instructions followed by the
///   <c> VINSERTF128 </c> instruction.
///
/// \param __addr_hi
///    A pointer to a 128-bit memory location containing 4 consecutive
///    single-precision floating-point values. These values are to be copied to
///    bits[255:128] of the result. The address of the memory location does not
///    have to be aligned.
/// \param __addr_lo
///    A pointer to a 128-bit memory location containing 4 consecutive
///    single-precision floating-point values. These values are to be copied to
///    bits[127:0] of the result. The address of the memory location does not
///    have to be aligned.
4769 /// \returns A 256-bit floating-point vector of [8 x float] containing the 4770 /// concatenated result. 4771 static __inline __m256 __DEFAULT_FN_ATTRS 4772 _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo) 4773 { 4774 __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo)); 4775 return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1); 4776 } 4777 4778 /// \brief Loads two 128-bit floating-point vectors of [2 x double] from 4779 /// unaligned memory locations and constructs a 256-bit floating-point vector 4780 /// of [4 x double] by concatenating the two 128-bit vectors. 4781 /// 4782 /// \headerfile <x86intrin.h> 4783 /// 4784 /// This intrinsic corresponds to load instructions followed by the 4785 /// <c> VINSERTF128 </c> instruction. 4786 /// 4787 /// \param __addr_hi 4788 /// A pointer to a 128-bit memory location containing two consecutive 4789 /// double-precision floating-point values. These values are to be copied to 4790 /// bits[255:128] of the result. The address of the memory location does not 4791 /// have to be aligned. 4792 /// \param __addr_lo 4793 /// A pointer to a 128-bit memory location containing two consecutive 4794 /// double-precision floating-point values. These values are to be copied to 4795 /// bits[127:0] of the result. The address of the memory location does not 4796 /// have to be aligned. 4797 /// \returns A 256-bit floating-point vector of [4 x double] containing the 4798 /// concatenated result. 4799 static __inline __m256d __DEFAULT_FN_ATTRS 4800 _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo) 4801 { 4802 __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo)); 4803 return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1); 4804 } 4805 4806 /// \brief Loads two 128-bit integer vectors from unaligned memory locations and 4807 /// constructs a 256-bit integer vector by concatenating the two 128-bit 4808 /// vectors. 
4809 /// 4810 /// \headerfile <x86intrin.h> 4811 /// 4812 /// This intrinsic corresponds to load instructions followed by the 4813 /// <c> VINSERTF128 </c> instruction. 4814 /// 4815 /// \param __addr_hi 4816 /// A pointer to a 128-bit memory location containing a 128-bit integer 4817 /// vector. This vector is to be copied to bits[255:128] of the result. The 4818 /// address of the memory location does not have to be aligned. 4819 /// \param __addr_lo 4820 /// A pointer to a 128-bit memory location containing a 128-bit integer 4821 /// vector. This vector is to be copied to bits[127:0] of the result. The 4822 /// address of the memory location does not have to be aligned. 4823 /// \returns A 256-bit integer vector containing the concatenated result. 4824 static __inline __m256i __DEFAULT_FN_ATTRS 4825 _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo) 4826 { 4827 __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo)); 4828 return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1); 4829 } 4830 4831 /* SIMD store ops (unaligned) */ 4832 /// \brief Stores the upper and lower 128 bits of a 256-bit floating-point 4833 /// vector of [8 x float] into two different unaligned memory locations. 4834 /// 4835 /// \headerfile <x86intrin.h> 4836 /// 4837 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 4838 /// store instructions. 4839 /// 4840 /// \param __addr_hi 4841 /// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 4842 /// copied to this memory location. The address of this memory location does 4843 /// not have to be aligned. 4844 /// \param __addr_lo 4845 /// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 4846 /// copied to this memory location. The address of this memory location does 4847 /// not have to be aligned. 4848 /// \param __a 4849 /// A 256-bit floating-point vector of [8 x float]. 
4850 static __inline void __DEFAULT_FN_ATTRS 4851 _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a) 4852 { 4853 __m128 __v128; 4854 4855 __v128 = _mm256_castps256_ps128(__a); 4856 _mm_storeu_ps(__addr_lo, __v128); 4857 __v128 = _mm256_extractf128_ps(__a, 1); 4858 _mm_storeu_ps(__addr_hi, __v128); 4859 } 4860 4861 /// \brief Stores the upper and lower 128 bits of a 256-bit floating-point 4862 /// vector of [4 x double] into two different unaligned memory locations. 4863 /// 4864 /// \headerfile <x86intrin.h> 4865 /// 4866 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 4867 /// store instructions. 4868 /// 4869 /// \param __addr_hi 4870 /// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 4871 /// copied to this memory location. The address of this memory location does 4872 /// not have to be aligned. 4873 /// \param __addr_lo 4874 /// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 4875 /// copied to this memory location. The address of this memory location does 4876 /// not have to be aligned. 4877 /// \param __a 4878 /// A 256-bit floating-point vector of [4 x double]. 4879 static __inline void __DEFAULT_FN_ATTRS 4880 _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a) 4881 { 4882 __m128d __v128; 4883 4884 __v128 = _mm256_castpd256_pd128(__a); 4885 _mm_storeu_pd(__addr_lo, __v128); 4886 __v128 = _mm256_extractf128_pd(__a, 1); 4887 _mm_storeu_pd(__addr_hi, __v128); 4888 } 4889 4890 /// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into 4891 /// two different unaligned memory locations. 4892 /// 4893 /// \headerfile <x86intrin.h> 4894 /// 4895 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 4896 /// store instructions. 4897 /// 4898 /// \param __addr_hi 4899 /// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 4900 /// copied to this memory location. 
The address of this memory location does 4901 /// not have to be aligned. 4902 /// \param __addr_lo 4903 /// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 4904 /// copied to this memory location. The address of this memory location does 4905 /// not have to be aligned. 4906 /// \param __a 4907 /// A 256-bit integer vector. 4908 static __inline void __DEFAULT_FN_ATTRS 4909 _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a) 4910 { 4911 __m128i __v128; 4912 4913 __v128 = _mm256_castsi256_si128(__a); 4914 _mm_storeu_si128(__addr_lo, __v128); 4915 __v128 = _mm256_extractf128_si256(__a, 1); 4916 _mm_storeu_si128(__addr_hi, __v128); 4917 } 4918 4919 /// \brief Constructs a 256-bit floating-point vector of [8 x float] by 4920 /// concatenating two 128-bit floating-point vectors of [4 x float]. 4921 /// 4922 /// \headerfile <x86intrin.h> 4923 /// 4924 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4925 /// 4926 /// \param __hi 4927 /// A 128-bit floating-point vector of [4 x float] to be copied to the upper 4928 /// 128 bits of the result. 4929 /// \param __lo 4930 /// A 128-bit floating-point vector of [4 x float] to be copied to the lower 4931 /// 128 bits of the result. 4932 /// \returns A 256-bit floating-point vector of [8 x float] containing the 4933 /// concatenated result. 4934 static __inline __m256 __DEFAULT_FN_ATTRS 4935 _mm256_set_m128 (__m128 __hi, __m128 __lo) 4936 { 4937 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7); 4938 } 4939 4940 /// \brief Constructs a 256-bit floating-point vector of [4 x double] by 4941 /// concatenating two 128-bit floating-point vectors of [2 x double]. 4942 /// 4943 /// \headerfile <x86intrin.h> 4944 /// 4945 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 
4946 /// 4947 /// \param __hi 4948 /// A 128-bit floating-point vector of [2 x double] to be copied to the upper 4949 /// 128 bits of the result. 4950 /// \param __lo 4951 /// A 128-bit floating-point vector of [2 x double] to be copied to the lower 4952 /// 128 bits of the result. 4953 /// \returns A 256-bit floating-point vector of [4 x double] containing the 4954 /// concatenated result. 4955 static __inline __m256d __DEFAULT_FN_ATTRS 4956 _mm256_set_m128d (__m128d __hi, __m128d __lo) 4957 { 4958 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); 4959 } 4960 4961 /// \brief Constructs a 256-bit integer vector by concatenating two 128-bit 4962 /// integer vectors. 4963 /// 4964 /// \headerfile <x86intrin.h> 4965 /// 4966 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4967 /// 4968 /// \param __hi 4969 /// A 128-bit integer vector to be copied to the upper 128 bits of the 4970 /// result. 4971 /// \param __lo 4972 /// A 128-bit integer vector to be copied to the lower 128 bits of the 4973 /// result. 4974 /// \returns A 256-bit integer vector containing the concatenated result. 4975 static __inline __m256i __DEFAULT_FN_ATTRS 4976 _mm256_set_m128i (__m128i __hi, __m128i __lo) 4977 { 4978 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); 4979 } 4980 4981 /// \brief Constructs a 256-bit floating-point vector of [8 x float] by 4982 /// concatenating two 128-bit floating-point vectors of [4 x float]. This is 4983 /// similar to _mm256_set_m128, but the order of the input parameters is 4984 /// swapped. 4985 /// 4986 /// \headerfile <x86intrin.h> 4987 /// 4988 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4989 /// 4990 /// \param __lo 4991 /// A 128-bit floating-point vector of [4 x float] to be copied to the lower 4992 /// 128 bits of the result. 4993 /// \param __hi 4994 /// A 128-bit floating-point vector of [4 x float] to be copied to the upper 4995 /// 128 bits of the result. 
4996 /// \returns A 256-bit floating-point vector of [8 x float] containing the 4997 /// concatenated result. 4998 static __inline __m256 __DEFAULT_FN_ATTRS 4999 _mm256_setr_m128 (__m128 __lo, __m128 __hi) 5000 { 5001 return _mm256_set_m128(__hi, __lo); 5002 } 5003 5004 /// \brief Constructs a 256-bit floating-point vector of [4 x double] by 5005 /// concatenating two 128-bit floating-point vectors of [2 x double]. This is 5006 /// similar to _mm256_set_m128d, but the order of the input parameters is 5007 /// swapped. 5008 /// 5009 /// \headerfile <x86intrin.h> 5010 /// 5011 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5012 /// 5013 /// \param __lo 5014 /// A 128-bit floating-point vector of [2 x double] to be copied to the lower 5015 /// 128 bits of the result. 5016 /// \param __hi 5017 /// A 128-bit floating-point vector of [2 x double] to be copied to the upper 5018 /// 128 bits of the result. 5019 /// \returns A 256-bit floating-point vector of [4 x double] containing the 5020 /// concatenated result. 5021 static __inline __m256d __DEFAULT_FN_ATTRS 5022 _mm256_setr_m128d (__m128d __lo, __m128d __hi) 5023 { 5024 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); 5025 } 5026 5027 /// \brief Constructs a 256-bit integer vector by concatenating two 128-bit 5028 /// integer vectors. This is similar to _mm256_set_m128i, but the order of 5029 /// the input parameters is swapped. 5030 /// 5031 /// \headerfile <x86intrin.h> 5032 /// 5033 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5034 /// 5035 /// \param __lo 5036 /// A 128-bit integer vector to be copied to the lower 128 bits of the 5037 /// result. 5038 /// \param __hi 5039 /// A 128-bit integer vector to be copied to the upper 128 bits of the 5040 /// result. 5041 /// \returns A 256-bit integer vector containing the concatenated result. 
5042 static __inline __m256i __DEFAULT_FN_ATTRS 5043 _mm256_setr_m128i (__m128i __lo, __m128i __hi) 5044 { 5045 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); 5046 } 5047 5048 #undef __DEFAULT_FN_ATTRS 5049 5050 #endif /* __AVXINTRIN_H */ 5051