/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

/* Refuse direct inclusion. The check keys off __IMMINTRIN_H, which is
 * presumably defined by <immintrin.h> before it pulls in this file. */
#ifndef __IMMINTRIN_H
#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

/* Include guard for this header. */
#ifndef __AVXINTRIN_H
#define __AVXINTRIN_H

/* Internal 32-byte vector views, one per element type. The intrinsics below
 * cast their arguments to these before operating on them. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Unsigned types */
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* Public 32-byte vector types used in the intrinsic signatures. */
typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Define the default attributes for the functions in this file: always
 * inlined, no debug info, and compiled for the "avx" target. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))

/* Arithmetic */
/// \brief Adds two 256-bit vectors of [4 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [4 x double] containing one of the source operands.
/// \returns A 256-bit vector of [4 x double] containing the sums of both
///    operands.
68 static __inline __m256d __DEFAULT_FN_ATTRS 69 _mm256_add_pd(__m256d __a, __m256d __b) 70 { 71 return (__m256d)((__v4df)__a+(__v4df)__b); 72 } 73 74 /// \brief Adds two 256-bit vectors of [8 x float]. 75 /// 76 /// \headerfile <x86intrin.h> 77 /// 78 /// This intrinsic corresponds to the <c> VADDPS </c> instruction. 79 /// 80 /// \param __a 81 /// A 256-bit vector of [8 x float] containing one of the source operands. 82 /// \param __b 83 /// A 256-bit vector of [8 x float] containing one of the source operands. 84 /// \returns A 256-bit vector of [8 x float] containing the sums of both 85 /// operands. 86 static __inline __m256 __DEFAULT_FN_ATTRS 87 _mm256_add_ps(__m256 __a, __m256 __b) 88 { 89 return (__m256)((__v8sf)__a+(__v8sf)__b); 90 } 91 92 /// \brief Subtracts two 256-bit vectors of [4 x double]. 93 /// 94 /// \headerfile <x86intrin.h> 95 /// 96 /// This intrinsic corresponds to the <c> VSUBPD </c> instruction. 97 /// 98 /// \param __a 99 /// A 256-bit vector of [4 x double] containing the minuend. 100 /// \param __b 101 /// A 256-bit vector of [4 x double] containing the subtrahend. 102 /// \returns A 256-bit vector of [4 x double] containing the differences between 103 /// both operands. 104 static __inline __m256d __DEFAULT_FN_ATTRS 105 _mm256_sub_pd(__m256d __a, __m256d __b) 106 { 107 return (__m256d)((__v4df)__a-(__v4df)__b); 108 } 109 110 /// \brief Subtracts two 256-bit vectors of [8 x float]. 111 /// 112 /// \headerfile <x86intrin.h> 113 /// 114 /// This intrinsic corresponds to the <c> VSUBPS </c> instruction. 115 /// 116 /// \param __a 117 /// A 256-bit vector of [8 x float] containing the minuend. 118 /// \param __b 119 /// A 256-bit vector of [8 x float] containing the subtrahend. 120 /// \returns A 256-bit vector of [8 x float] containing the differences between 121 /// both operands. 
122 static __inline __m256 __DEFAULT_FN_ATTRS 123 _mm256_sub_ps(__m256 __a, __m256 __b) 124 { 125 return (__m256)((__v8sf)__a-(__v8sf)__b); 126 } 127 128 /// \brief Adds the even-indexed values and subtracts the odd-indexed values of 129 /// two 256-bit vectors of [4 x double]. 130 /// 131 /// \headerfile <x86intrin.h> 132 /// 133 /// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction. 134 /// 135 /// \param __a 136 /// A 256-bit vector of [4 x double] containing the left source operand. 137 /// \param __b 138 /// A 256-bit vector of [4 x double] containing the right source operand. 139 /// \returns A 256-bit vector of [4 x double] containing the alternating sums 140 /// and differences between both operands. 141 static __inline __m256d __DEFAULT_FN_ATTRS 142 _mm256_addsub_pd(__m256d __a, __m256d __b) 143 { 144 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b); 145 } 146 147 /// \brief Adds the even-indexed values and subtracts the odd-indexed values of 148 /// two 256-bit vectors of [8 x float]. 149 /// 150 /// \headerfile <x86intrin.h> 151 /// 152 /// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction. 153 /// 154 /// \param __a 155 /// A 256-bit vector of [8 x float] containing the left source operand. 156 /// \param __b 157 /// A 256-bit vector of [8 x float] containing the right source operand. 158 /// \returns A 256-bit vector of [8 x float] containing the alternating sums and 159 /// differences between both operands. 160 static __inline __m256 __DEFAULT_FN_ATTRS 161 _mm256_addsub_ps(__m256 __a, __m256 __b) 162 { 163 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b); 164 } 165 166 /// \brief Divides two 256-bit vectors of [4 x double]. 167 /// 168 /// \headerfile <x86intrin.h> 169 /// 170 /// This intrinsic corresponds to the <c> VDIVPD </c> instruction. 171 /// 172 /// \param __a 173 /// A 256-bit vector of [4 x double] containing the dividend. 
174 /// \param __b 175 /// A 256-bit vector of [4 x double] containing the divisor. 176 /// \returns A 256-bit vector of [4 x double] containing the quotients of both 177 /// operands. 178 static __inline __m256d __DEFAULT_FN_ATTRS 179 _mm256_div_pd(__m256d __a, __m256d __b) 180 { 181 return (__m256d)((__v4df)__a/(__v4df)__b); 182 } 183 184 /// \brief Divides two 256-bit vectors of [8 x float]. 185 /// 186 /// \headerfile <x86intrin.h> 187 /// 188 /// This intrinsic corresponds to the <c> VDIVPS </c> instruction. 189 /// 190 /// \param __a 191 /// A 256-bit vector of [8 x float] containing the dividend. 192 /// \param __b 193 /// A 256-bit vector of [8 x float] containing the divisor. 194 /// \returns A 256-bit vector of [8 x float] containing the quotients of both 195 /// operands. 196 static __inline __m256 __DEFAULT_FN_ATTRS 197 _mm256_div_ps(__m256 __a, __m256 __b) 198 { 199 return (__m256)((__v8sf)__a/(__v8sf)__b); 200 } 201 202 /// \brief Compares two 256-bit vectors of [4 x double] and returns the greater 203 /// of each pair of values. 204 /// 205 /// \headerfile <x86intrin.h> 206 /// 207 /// This intrinsic corresponds to the <c> VMAXPD </c> instruction. 208 /// 209 /// \param __a 210 /// A 256-bit vector of [4 x double] containing one of the operands. 211 /// \param __b 212 /// A 256-bit vector of [4 x double] containing one of the operands. 213 /// \returns A 256-bit vector of [4 x double] containing the maximum values 214 /// between both operands. 215 static __inline __m256d __DEFAULT_FN_ATTRS 216 _mm256_max_pd(__m256d __a, __m256d __b) 217 { 218 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b); 219 } 220 221 /// \brief Compares two 256-bit vectors of [8 x float] and returns the greater 222 /// of each pair of values. 223 /// 224 /// \headerfile <x86intrin.h> 225 /// 226 /// This intrinsic corresponds to the <c> VMAXPS </c> instruction. 227 /// 228 /// \param __a 229 /// A 256-bit vector of [8 x float] containing one of the operands. 
230 /// \param __b 231 /// A 256-bit vector of [8 x float] containing one of the operands. 232 /// \returns A 256-bit vector of [8 x float] containing the maximum values 233 /// between both operands. 234 static __inline __m256 __DEFAULT_FN_ATTRS 235 _mm256_max_ps(__m256 __a, __m256 __b) 236 { 237 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b); 238 } 239 240 /// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser 241 /// of each pair of values. 242 /// 243 /// \headerfile <x86intrin.h> 244 /// 245 /// This intrinsic corresponds to the <c> VMINPD </c> instruction. 246 /// 247 /// \param __a 248 /// A 256-bit vector of [4 x double] containing one of the operands. 249 /// \param __b 250 /// A 256-bit vector of [4 x double] containing one of the operands. 251 /// \returns A 256-bit vector of [4 x double] containing the minimum values 252 /// between both operands. 253 static __inline __m256d __DEFAULT_FN_ATTRS 254 _mm256_min_pd(__m256d __a, __m256d __b) 255 { 256 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b); 257 } 258 259 /// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser 260 /// of each pair of values. 261 /// 262 /// \headerfile <x86intrin.h> 263 /// 264 /// This intrinsic corresponds to the <c> VMINPS </c> instruction. 265 /// 266 /// \param __a 267 /// A 256-bit vector of [8 x float] containing one of the operands. 268 /// \param __b 269 /// A 256-bit vector of [8 x float] containing one of the operands. 270 /// \returns A 256-bit vector of [8 x float] containing the minimum values 271 /// between both operands. 272 static __inline __m256 __DEFAULT_FN_ATTRS 273 _mm256_min_ps(__m256 __a, __m256 __b) 274 { 275 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b); 276 } 277 278 /// \brief Multiplies two 256-bit vectors of [4 x double]. 279 /// 280 /// \headerfile <x86intrin.h> 281 /// 282 /// This intrinsic corresponds to the <c> VMULPD </c> instruction. 
283 /// 284 /// \param __a 285 /// A 256-bit vector of [4 x double] containing one of the operands. 286 /// \param __b 287 /// A 256-bit vector of [4 x double] containing one of the operands. 288 /// \returns A 256-bit vector of [4 x double] containing the products of both 289 /// operands. 290 static __inline __m256d __DEFAULT_FN_ATTRS 291 _mm256_mul_pd(__m256d __a, __m256d __b) 292 { 293 return (__m256d)((__v4df)__a * (__v4df)__b); 294 } 295 296 /// \brief Multiplies two 256-bit vectors of [8 x float]. 297 /// 298 /// \headerfile <x86intrin.h> 299 /// 300 /// This intrinsic corresponds to the <c> VMULPS </c> instruction. 301 /// 302 /// \param __a 303 /// A 256-bit vector of [8 x float] containing one of the operands. 304 /// \param __b 305 /// A 256-bit vector of [8 x float] containing one of the operands. 306 /// \returns A 256-bit vector of [8 x float] containing the products of both 307 /// operands. 308 static __inline __m256 __DEFAULT_FN_ATTRS 309 _mm256_mul_ps(__m256 __a, __m256 __b) 310 { 311 return (__m256)((__v8sf)__a * (__v8sf)__b); 312 } 313 314 /// \brief Calculates the square roots of the values in a 256-bit vector of 315 /// [4 x double]. 316 /// 317 /// \headerfile <x86intrin.h> 318 /// 319 /// This intrinsic corresponds to the <c> VSQRTPD </c> instruction. 320 /// 321 /// \param __a 322 /// A 256-bit vector of [4 x double]. 323 /// \returns A 256-bit vector of [4 x double] containing the square roots of the 324 /// values in the operand. 325 static __inline __m256d __DEFAULT_FN_ATTRS 326 _mm256_sqrt_pd(__m256d __a) 327 { 328 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a); 329 } 330 331 /// \brief Calculates the square roots of the values in a 256-bit vector of 332 /// [8 x float]. 333 /// 334 /// \headerfile <x86intrin.h> 335 /// 336 /// This intrinsic corresponds to the <c> VSQRTPS </c> instruction. 337 /// 338 /// \param __a 339 /// A 256-bit vector of [8 x float]. 
340 /// \returns A 256-bit vector of [8 x float] containing the square roots of the 341 /// values in the operand. 342 static __inline __m256 __DEFAULT_FN_ATTRS 343 _mm256_sqrt_ps(__m256 __a) 344 { 345 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a); 346 } 347 348 /// \brief Calculates the reciprocal square roots of the values in a 256-bit 349 /// vector of [8 x float]. 350 /// 351 /// \headerfile <x86intrin.h> 352 /// 353 /// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction. 354 /// 355 /// \param __a 356 /// A 256-bit vector of [8 x float]. 357 /// \returns A 256-bit vector of [8 x float] containing the reciprocal square 358 /// roots of the values in the operand. 359 static __inline __m256 __DEFAULT_FN_ATTRS 360 _mm256_rsqrt_ps(__m256 __a) 361 { 362 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a); 363 } 364 365 /// \brief Calculates the reciprocals of the values in a 256-bit vector of 366 /// [8 x float]. 367 /// 368 /// \headerfile <x86intrin.h> 369 /// 370 /// This intrinsic corresponds to the <c> VRCPPS </c> instruction. 371 /// 372 /// \param __a 373 /// A 256-bit vector of [8 x float]. 374 /// \returns A 256-bit vector of [8 x float] containing the reciprocals of the 375 /// values in the operand. 376 static __inline __m256 __DEFAULT_FN_ATTRS 377 _mm256_rcp_ps(__m256 __a) 378 { 379 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a); 380 } 381 382 /// \brief Rounds the values in a 256-bit vector of [4 x double] as specified 383 /// by the byte operand. The source values are rounded to integer values and 384 /// returned as 64-bit double-precision floating-point values. 385 /// 386 /// \headerfile <x86intrin.h> 387 /// 388 /// \code 389 /// __m256d _mm256_round_pd(__m256d V, const int M); 390 /// \endcode 391 /// 392 /// This intrinsic corresponds to the <c> VROUNDPD </c> instruction. 393 /// 394 /// \param V 395 /// A 256-bit vector of [4 x double]. 396 /// \param M 397 /// An integer value that specifies the rounding operation. 
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* Expands to a plain parenthesized expression rather than a GNU statement
 * expression: statement expressions are rejected outside function bodies,
 * and a single cast-of-call needs no statement scope. */
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))

/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* Expands to a plain parenthesized expression rather than a GNU statement
 * expression: statement expressions are rejected outside function bodies,
 * and a single cast-of-call needs no statement scope. */
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))

/// \brief Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)

/// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)

/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
496 #define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) 497 498 /// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The 499 /// source values are rounded down to integer values and returned as 500 /// floating-point values. 501 /// 502 /// \headerfile <x86intrin.h> 503 /// 504 /// \code 505 /// __m256 _mm256_floor_ps(__m256 V); 506 /// \endcode 507 /// 508 /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. 509 /// 510 /// \param V 511 /// A 256-bit vector of [8 x float]. 512 /// \returns A 256-bit vector of [8 x float] containing the rounded down values. 513 #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) 514 515 /* Logical */ 516 /// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double]. 517 /// 518 /// \headerfile <x86intrin.h> 519 /// 520 /// This intrinsic corresponds to the <c> VANDPD </c> instruction. 521 /// 522 /// \param __a 523 /// A 256-bit vector of [4 x double] containing one of the source operands. 524 /// \param __b 525 /// A 256-bit vector of [4 x double] containing one of the source operands. 526 /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the 527 /// values between both operands. 528 static __inline __m256d __DEFAULT_FN_ATTRS 529 _mm256_and_pd(__m256d __a, __m256d __b) 530 { 531 return (__m256d)((__v4du)__a & (__v4du)__b); 532 } 533 534 /// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float]. 535 /// 536 /// \headerfile <x86intrin.h> 537 /// 538 /// This intrinsic corresponds to the <c> VANDPS </c> instruction. 539 /// 540 /// \param __a 541 /// A 256-bit vector of [8 x float] containing one of the source operands. 542 /// \param __b 543 /// A 256-bit vector of [8 x float] containing one of the source operands. 544 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 545 /// values between both operands. 
546 static __inline __m256 __DEFAULT_FN_ATTRS 547 _mm256_and_ps(__m256 __a, __m256 __b) 548 { 549 return (__m256)((__v8su)__a & (__v8su)__b); 550 } 551 552 /// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using 553 /// the one's complement of the values contained in the first source operand. 554 /// 555 /// \headerfile <x86intrin.h> 556 /// 557 /// This intrinsic corresponds to the <c> VANDNPD </c> instruction. 558 /// 559 /// \param __a 560 /// A 256-bit vector of [4 x double] containing the left source operand. The 561 /// one's complement of this value is used in the bitwise AND. 562 /// \param __b 563 /// A 256-bit vector of [4 x double] containing the right source operand. 564 /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the 565 /// values of the second operand and the one's complement of the first 566 /// operand. 567 static __inline __m256d __DEFAULT_FN_ATTRS 568 _mm256_andnot_pd(__m256d __a, __m256d __b) 569 { 570 return (__m256d)(~(__v4du)__a & (__v4du)__b); 571 } 572 573 /// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using 574 /// the one's complement of the values contained in the first source operand. 575 /// 576 /// \headerfile <x86intrin.h> 577 /// 578 /// This intrinsic corresponds to the <c> VANDNPS </c> instruction. 579 /// 580 /// \param __a 581 /// A 256-bit vector of [8 x float] containing the left source operand. The 582 /// one's complement of this value is used in the bitwise AND. 583 /// \param __b 584 /// A 256-bit vector of [8 x float] containing the right source operand. 585 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 586 /// values of the second operand and the one's complement of the first 587 /// operand. 
588 static __inline __m256 __DEFAULT_FN_ATTRS 589 _mm256_andnot_ps(__m256 __a, __m256 __b) 590 { 591 return (__m256)(~(__v8su)__a & (__v8su)__b); 592 } 593 594 /// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double]. 595 /// 596 /// \headerfile <x86intrin.h> 597 /// 598 /// This intrinsic corresponds to the <c> VORPD </c> instruction. 599 /// 600 /// \param __a 601 /// A 256-bit vector of [4 x double] containing one of the source operands. 602 /// \param __b 603 /// A 256-bit vector of [4 x double] containing one of the source operands. 604 /// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the 605 /// values between both operands. 606 static __inline __m256d __DEFAULT_FN_ATTRS 607 _mm256_or_pd(__m256d __a, __m256d __b) 608 { 609 return (__m256d)((__v4du)__a | (__v4du)__b); 610 } 611 612 /// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float]. 613 /// 614 /// \headerfile <x86intrin.h> 615 /// 616 /// This intrinsic corresponds to the <c> VORPS </c> instruction. 617 /// 618 /// \param __a 619 /// A 256-bit vector of [8 x float] containing one of the source operands. 620 /// \param __b 621 /// A 256-bit vector of [8 x float] containing one of the source operands. 622 /// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the 623 /// values between both operands. 624 static __inline __m256 __DEFAULT_FN_ATTRS 625 _mm256_or_ps(__m256 __a, __m256 __b) 626 { 627 return (__m256)((__v8su)__a | (__v8su)__b); 628 } 629 630 /// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double]. 631 /// 632 /// \headerfile <x86intrin.h> 633 /// 634 /// This intrinsic corresponds to the <c> VXORPD </c> instruction. 635 /// 636 /// \param __a 637 /// A 256-bit vector of [4 x double] containing one of the source operands. 638 /// \param __b 639 /// A 256-bit vector of [4 x double] containing one of the source operands. 
640 /// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the 641 /// values between both operands. 642 static __inline __m256d __DEFAULT_FN_ATTRS 643 _mm256_xor_pd(__m256d __a, __m256d __b) 644 { 645 return (__m256d)((__v4du)__a ^ (__v4du)__b); 646 } 647 648 /// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float]. 649 /// 650 /// \headerfile <x86intrin.h> 651 /// 652 /// This intrinsic corresponds to the <c> VXORPS </c> instruction. 653 /// 654 /// \param __a 655 /// A 256-bit vector of [8 x float] containing one of the source operands. 656 /// \param __b 657 /// A 256-bit vector of [8 x float] containing one of the source operands. 658 /// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the 659 /// values between both operands. 660 static __inline __m256 __DEFAULT_FN_ATTRS 661 _mm256_xor_ps(__m256 __a, __m256 __b) 662 { 663 return (__m256)((__v8su)__a ^ (__v8su)__b); 664 } 665 666 /* Horizontal arithmetic */ 667 /// \brief Horizontally adds the adjacent pairs of values contained in two 668 /// 256-bit vectors of [4 x double]. 669 /// 670 /// \headerfile <x86intrin.h> 671 /// 672 /// This intrinsic corresponds to the <c> VHADDPD </c> instruction. 673 /// 674 /// \param __a 675 /// A 256-bit vector of [4 x double] containing one of the source operands. 676 /// The horizontal sums of the values are returned in the even-indexed 677 /// elements of a vector of [4 x double]. 678 /// \param __b 679 /// A 256-bit vector of [4 x double] containing one of the source operands. 680 /// The horizontal sums of the values are returned in the odd-indexed 681 /// elements of a vector of [4 x double]. 682 /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of 683 /// both operands. 
684 static __inline __m256d __DEFAULT_FN_ATTRS 685 _mm256_hadd_pd(__m256d __a, __m256d __b) 686 { 687 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); 688 } 689 690 /// \brief Horizontally adds the adjacent pairs of values contained in two 691 /// 256-bit vectors of [8 x float]. 692 /// 693 /// \headerfile <x86intrin.h> 694 /// 695 /// This intrinsic corresponds to the <c> VHADDPS </c> instruction. 696 /// 697 /// \param __a 698 /// A 256-bit vector of [8 x float] containing one of the source operands. 699 /// The horizontal sums of the values are returned in the elements with 700 /// index 0, 1, 4, 5 of a vector of [8 x float]. 701 /// \param __b 702 /// A 256-bit vector of [8 x float] containing one of the source operands. 703 /// The horizontal sums of the values are returned in the elements with 704 /// index 2, 3, 6, 7 of a vector of [8 x float]. 705 /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of 706 /// both operands. 707 static __inline __m256 __DEFAULT_FN_ATTRS 708 _mm256_hadd_ps(__m256 __a, __m256 __b) 709 { 710 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); 711 } 712 713 /// \brief Horizontally subtracts the adjacent pairs of values contained in two 714 /// 256-bit vectors of [4 x double]. 715 /// 716 /// \headerfile <x86intrin.h> 717 /// 718 /// This intrinsic corresponds to the <c> VHSUBPD </c> instruction. 719 /// 720 /// \param __a 721 /// A 256-bit vector of [4 x double] containing one of the source operands. 722 /// The horizontal differences between the values are returned in the 723 /// even-indexed elements of a vector of [4 x double]. 724 /// \param __b 725 /// A 256-bit vector of [4 x double] containing one of the source operands. 726 /// The horizontal differences between the values are returned in the 727 /// odd-indexed elements of a vector of [4 x double]. 728 /// \returns A 256-bit vector of [4 x double] containing the horizontal 729 /// differences of both operands. 
730 static __inline __m256d __DEFAULT_FN_ATTRS 731 _mm256_hsub_pd(__m256d __a, __m256d __b) 732 { 733 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); 734 } 735 736 /// \brief Horizontally subtracts the adjacent pairs of values contained in two 737 /// 256-bit vectors of [8 x float]. 738 /// 739 /// \headerfile <x86intrin.h> 740 /// 741 /// This intrinsic corresponds to the <c> VHSUBPS </c> instruction. 742 /// 743 /// \param __a 744 /// A 256-bit vector of [8 x float] containing one of the source operands. 745 /// The horizontal differences between the values are returned in the 746 /// elements with index 0, 1, 4, 5 of a vector of [8 x float]. 747 /// \param __b 748 /// A 256-bit vector of [8 x float] containing one of the source operands. 749 /// The horizontal differences between the values are returned in the 750 /// elements with index 2, 3, 6, 7 of a vector of [8 x float]. 751 /// \returns A 256-bit vector of [8 x float] containing the horizontal 752 /// differences of both operands. 753 static __inline __m256 __DEFAULT_FN_ATTRS 754 _mm256_hsub_ps(__m256 __a, __m256 __b) 755 { 756 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b); 757 } 758 759 /* Vector permutations */ 760 /// \brief Copies the values in a 128-bit vector of [2 x double] as specified 761 /// by the 128-bit integer vector operand. 762 /// 763 /// \headerfile <x86intrin.h> 764 /// 765 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 766 /// 767 /// \param __a 768 /// A 128-bit vector of [2 x double]. 769 /// \param __c 770 /// A 128-bit integer vector operand specifying how the values are to be 771 /// copied. \n 772 /// Bit [1]: \n 773 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 774 /// vector. \n 775 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the 776 /// returned vector. 
\n 777 /// Bit [65]: \n 778 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the 779 /// returned vector. \n 780 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the 781 /// returned vector. 782 /// \returns A 128-bit vector of [2 x double] containing the copied values. 783 static __inline __m128d __DEFAULT_FN_ATTRS 784 _mm_permutevar_pd(__m128d __a, __m128i __c) 785 { 786 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c); 787 } 788 789 /// \brief Copies the values in a 256-bit vector of [4 x double] as specified 790 /// by the 256-bit integer vector operand. 791 /// 792 /// \headerfile <x86intrin.h> 793 /// 794 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 795 /// 796 /// \param __a 797 /// A 256-bit vector of [4 x double]. 798 /// \param __c 799 /// A 256-bit integer vector operand specifying how the values are to be 800 /// copied. \n 801 /// Bit [1]: \n 802 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 803 /// vector. \n 804 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the 805 /// returned vector. \n 806 /// Bit [65]: \n 807 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the 808 /// returned vector. \n 809 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the 810 /// returned vector. \n 811 /// Bit [129]: \n 812 /// 0: Bits [191:128] of the source are copied to bits [191:128] of the 813 /// returned vector. \n 814 /// 1: Bits [255:192] of the source are copied to bits [191:128] of the 815 /// returned vector. \n 816 /// Bit [193]: \n 817 /// 0: Bits [191:128] of the source are copied to bits [255:192] of the 818 /// returned vector. \n 819 /// 1: Bits [255:192] of the source are copied to bits [255:192] of the 820 /// returned vector. 821 /// \returns A 256-bit vector of [4 x double] containing the copied values. 
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_permutevar_pd(__m256d __a, __m256i __c)
{
  /* The casts only reinterpret the operands as the vector types the builtin
     takes; the selection itself is done entirely by the builtin. */
  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
}

/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
///    specified by the 128-bit integer vector operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __c
///    A 128-bit integer vector operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector. \n
///    Bits [33:32]: \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [65:64]: \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [97:96]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
static __inline __m128 __DEFAULT_FN_ATTRS
_mm_permutevar_ps(__m128 __a, __m128i __c)
{
  /* __c is reinterpreted as four 32-bit lanes so that each lane supplies the
     2-bit selector for the destination element in the same position. */
  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
}

/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
///    specified by the 256-bit integer vector operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float].
/// \param __c
///    A 256-bit integer vector operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector. \n
///    Bits [33:32]: \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [65:64]: \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [97:96]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector. \n
///    Bits [129:128]: \n
///      00: Bits [159:128] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [159:128] of the
///          returned vector. \n
///    Bits [161:160]: \n
///      00: Bits [159:128] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [191:160] of the
///          returned vector. \n
///    Bits [193:192]: \n
///      00: Bits [159:128] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [223:192] of the
///          returned vector. \n
///    Bits [225:224]: \n
///      00: Bits [159:128] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [255:224] of the
///          returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_permutevar_ps(__m256 __a, __m256i __c)
{
  /* __c is reinterpreted as eight 32-bit lanes so that each lane supplies the
     2-bit selector for the destination element in the same position. */
  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
}

/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
///    by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_permute_pd(__m128d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 128-bit vector of [2 x double].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bit [0]: \n
///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
///         vector. \n
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector. \n
///    Bit [1]: \n
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
/* Both shuffle indices are masked to 0..1, so only elements of A are ever
   selected; the undefined vector merely fills the second operand slot. */
#define _mm_permute_pd(A, C) __extension__ ({ \
  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
                                   (__v2df)_mm_undefined_pd(), \
                                   ((C) >> 0) & 0x1, ((C) >> 1) & 0x1); })

/// \brief Copies the values in a 256-bit vector of [4 x double] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute_pd(__m256d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 256-bit vector of [4 x double].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bit [0]: \n
///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
///         vector. \n
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector. \n
///    Bit [1]: \n
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector. \n
///    Bit [2]: \n
///      0: Bits [191:128] of the source are copied to bits [191:128] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [191:128] of the
///         returned vector. \n
///    Bit [3]: \n
///      0: Bits [191:128] of the source are copied to bits [255:192] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [255:192] of the
///         returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Indices are base (0 or 2) plus one immediate bit, so every selection stays
   inside its own 128-bit half of A; the undefined second operand is never
   selected. */
#define _mm256_permute_pd(A, C) __extension__ ({ \
  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
                                   (__v4df)_mm256_undefined_pd(), \
                                   0 + (((C) >> 0) & 0x1), \
                                   0 + (((C) >> 1) & 0x1), \
                                   2 + (((C) >> 2) & 0x1), \
                                   2 + (((C) >> 3) & 0x1)); })

/// \brief Copies the values in a 128-bit vector of [4 x float] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_permute_ps(__m128 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 128-bit vector of [4 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
/* All four indices are masked to 0..3, so only elements of A are ever
   selected; the undefined vector merely fills the second operand slot. */
#define _mm_permute_ps(A, C) __extension__ ({ \
  (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
                                  (__v4sf)_mm_undefined_ps(), \
                                  ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
                                  ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })

/// \brief Copies the values in a 256-bit vector of [8 x float] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute_ps(__m256 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 256-bit vector of [8 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. The same four 2-bit fields are applied to both 128-bit halves
///    of the source: each field selects an element of the lower half for the
///    lower half of the result and, independently, an element of the upper
///    half for the upper half of the result. \n
///    Bits [1:0]: \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector. \n
///    Bits [1:0]: \n
///      00: Bits [159:128] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [159:128] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [159:128] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [191:160] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [159:128] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [223:192] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [159:128] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [255:224] of the
///          returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* The upper four indices reuse the same immediate fields as the lower four,
   offset by 4, so each selection stays inside its own 128-bit half of A. */
#define _mm256_permute_ps(A, C) __extension__ ({ \
  (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
                                  (__v8sf)_mm256_undefined_ps(), \
                                  0 + (((C) >> 0) & 0x3), \
                                  0 + (((C) >> 2) & 0x3), \
                                  0 + (((C) >> 4) & 0x3), \
                                  0 + (((C) >> 6) & 0x3), \
                                  4 + (((C) >> 0) & 0x3), \
                                  4 + (((C) >> 2) & 0x3), \
                                  4 + (((C) >> 4) & 0x3), \
                                  4 + (((C) >> 6) & 0x3)); })

/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
///    [4 x double], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* The immediate is forwarded to the builtin unchanged. */
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
                                           (__v4df)(__m256d)(V2), (M)); })

/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
///    [8 x float], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* The immediate is forwarded to the builtin unchanged. */
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                          (__v8sf)(__m256)(V2), (M)); })

/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
///    as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector.
/// \param V2
///    A 256-bit integer vector.
/// \param M
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit integer vector containing the copied values.
/* The operands are passed to the builtin as vectors of eight 32-bit integers;
   the immediate is forwarded unchanged. */
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                           (__v8si)(__m256i)(V2), (M)); })

/* Vector Blend */
/// \brief Merges 64-bit double-precision data values stored in either of the
///    two 256-bit vectors of [4 x double], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 64-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
/* Shuffle indices 0..3 address V1 and 4..7 address V2, so result element i is
   index i (from V1) or 4 + i (from V2) depending on mask bit i. */
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
                                   (__v4df)(__m256d)(V2), \
                                   (((M) & 0x01) ? 4 : 0), \
                                   (((M) & 0x02) ? 5 : 1), \
                                   (((M) & 0x04) ? 6 : 2), \
                                   (((M) & 0x08) ? 7 : 3)); })

/// \brief Merges 32-bit single-precision data values stored in either of the
///    two 256-bit vectors of [8 x float], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 32-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
/* Shuffle indices 0..7 address V1 and 8..15 address V2, so result element i is
   index i (from V1) or 8 + i (from V2) depending on mask bit i. */
#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
                                  (__v8sf)(__m256)(V2), \
                                  (((M) & 0x01) ?  8 : 0), \
                                  (((M) & 0x02) ?  9 : 1), \
                                  (((M) & 0x04) ? 10 : 2), \
                                  (((M) & 0x08) ? 11 : 3), \
                                  (((M) & 0x10) ? 12 : 4), \
                                  (((M) & 0x20) ? 13 : 5), \
                                  (((M) & 0x40) ? 14 : 6), \
                                  (((M) & 0x80) ? 15 : 7)); })

/// \brief Merges 64-bit double-precision data values stored in either of the
///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
///    operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double].
/// \param __b
///    A 256-bit vector of [4 x double].
/// \param __c
///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
///    how the values are to be copied. The position of the mask bit corresponds
///    to the most significant bit of a copied value. When a mask bit is 0, the
///    corresponding 64-bit element in operand \a __a is copied to the same
///    position in the destination. When a mask bit is 1, the corresponding
///    64-bit element in operand \a __b is copied to the same position in the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
{
  /* The mask __c is handed to the builtin as a floating-point vector; only
     the most significant bit of each 64-bit element is documented as used. */
  return (__m256d)__builtin_ia32_blendvpd256(
    (__v4df)__a, (__v4df)__b, (__v4df)__c);
}

/// \brief Merges 32-bit single-precision data values stored in either of the
///    two 256-bit vectors of [8 x float], as specified by the 256-bit vector
///    operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float].
/// \param __b
///    A 256-bit vector of [8 x float].
/// \param __c
///    A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
///    and 31 specifying how the values are to be copied. The position of the
///    mask bit corresponds to the most significant bit of a copied value. When
///    a mask bit is 0, the corresponding 32-bit element in operand \a __a is
///    copied to the same position in the destination. When a mask bit is 1, the
///    corresponding 32-bit element in operand \a __b is copied to the same
///    position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
{
  /* The mask __c is handed to the builtin as a floating-point vector; only
     the most significant bit of each 32-bit element is documented as used. */
  return (__m256)__builtin_ia32_blendvps256(
    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
}

/* Vector Dot Product */
/// \brief Computes two dot products in parallel, using the lower and upper
///    halves of two [8 x float] vectors as input to the two computations, and
///    returning the two dot products in the lower and upper halves of the
///    [8 x float] result.
///
///    The immediate integer operand controls which input elements will
///    contribute to the dot product, and where the final results are returned.
///    In general, for each dot product, the four corresponding elements of the
///    input vectors are multiplied; the first two and second two products are
///    summed, then the two sums are added to form the final result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
///
/// \param V1
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param V2
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param M
///    An immediate integer argument. Bits [7:4] determine which elements of
///    the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [7] corresponding to the highest element of each [4 x
///    float] subvector. If a bit is set, the corresponding elements from the
///    two input vectors are used as an input for dot product; otherwise that
///    input is treated as zero. Bits [3:0] determine which elements of the
///    result will receive a copy of the final dot product, with bit [0]
///    corresponding to the lowest element and bit [3] corresponding to the
///    highest element of each [4 x float] subvector. If a bit is set, the dot
///    product is returned in the corresponding element; otherwise that element
///    is set to zero. The bitmask is applied in the same way to each of the
///    two parallel dot product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
/* The immediate is forwarded to the builtin unchanged. */
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                 (__v8sf)(__m256)(V2), (M)); })

/* Vector shuffle */
/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
///    specified by the immediate value operand.
///
///    The four selected elements in each operand are copied to the destination
///    according to the bits specified in the immediate operand. The selected
///    elements from the first 256-bit operand are copied to bits [63:0] and
///    bits [191:128] of the destination, and the selected elements from the
///    second 256-bit operand are copied to bits [127:64] and bits [255:192] of
///    the destination. For example, if bits [7:0] of the immediate operand
///    contain a value of 0xFF, the 256-bit destination vector would contain the
///    following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [63:0] and bits [191:128] in the destination,
///    according to the bits specified in the immediate operand.
/// \param b
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [127:64] and bits [255:192] in the
///    destination, according to the bits specified in the immediate operand.
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 256-bit destination are assigned values as
///    follows, according to the bit value assignments described below: \n
///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
///    the destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
///    11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
/* Shuffle indices 0..7 address a and 8..15 address b. Bases 0 and 8 pick from
   the lower 128-bit halves, bases 4 and 12 from the upper halves; both halves
   reuse the same 2-bit fields of the mask. */
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
  (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
                                  (__v8sf)(__m256)(b), \
                                  0  + (((mask) >> 0) & 0x3), \
                                  0  + (((mask) >> 2) & 0x3), \
                                  8  + (((mask) >> 4) & 0x3), \
                                  8  + (((mask) >> 6) & 0x3), \
                                  4  + (((mask) >> 0) & 0x3), \
                                  4  + (((mask) >> 2) & 0x3), \
                                  12 + (((mask) >> 4) & 0x3), \
                                  12 + (((mask) >> 6) & 0x3)); })

/// \brief Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand.
///
///    The selected elements from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] in the destination, and the selected elements
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] in the destination. For example, if bits [3:0] of the immediate
///    operand contain a value of 0xF, the 256-bit destination vector would
///    contain the following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value containing 8-bit values specifying which elements to
///    copy from \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///    destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
/* __builtin_shufflevector indices 0-3 name elements of (a) and 4-7 name
 * elements of (b). Bit i of (mask) picks the lower (0) or upper (1) element
 * of the pair that feeds result element i: bases 0 and 2 read from (a),
 * bases 4 and 6 read from (b). */
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
                                   (__v4df)(__m256d)(b), \
                                   0 + (((mask) >> 0) & 0x1), \
                                   4 + (((mask) >> 1) & 0x1), \
                                   2 + (((mask) >> 2) & 0x1), \
                                   6 + (((mask) >> 3) & 0x1)); })

/* Compare: predicate values for the immediate operand (bits [4:0]) of the
 * _mm_cmp_* and _mm256_cmp_* intrinsics documented below. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling) */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling) */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling) */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */

/// \brief Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [2 x double] vector consisting of two doubles corresponding to
///    the two comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the _CMP_* macros defined in this header name these
///    values): \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                (__v2df)(__m128d)(b), (c)); })

/// \brief Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [4 x float] vector consisting of four floats corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the _CMP_* macros defined in this header name these
///    values): \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                               (__v4sf)(__m128)(b), (c)); })

/// \brief Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [4 x double] vector consisting of four doubles corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the _CMP_* macros defined in this header name these
///    values): \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                   (__v4df)(__m256d)(b), (c)); })

/// \brief Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [8 x float] vector consisting of eight floats corresponding to
///    the eight comparison results: zero if the comparison is false, and all
///    1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the _CMP_* macros defined in this header name these
///    values): \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                  (__v8sf)(__m256)(b), (c)); })

/// \brief Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    If the result is true, all 64 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the _CMP_* macros defined in this header name these
///    values): \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
1940 #define _mm_cmp_sd(a, b, c) __extension__ ({ \ 1941 (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \ 1942 (__v2df)(__m128d)(b), (c)); }) 1943 1944 /// \brief Compares each of the corresponding scalar values of two 128-bit 1945 /// vectors of [4 x float], using the operation specified by the immediate 1946 /// integer operand. 1947 /// 1948 /// If the result is true, all 32 bits of the destination vector are set; 1949 /// otherwise they are cleared. 1950 /// 1951 /// \headerfile <x86intrin.h> 1952 /// 1953 /// \code 1954 /// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c); 1955 /// \endcode 1956 /// 1957 /// This intrinsic corresponds to the <c> VCMPSS </c> instruction. 1958 /// 1959 /// \param a 1960 /// A 128-bit vector of [4 x float]. 1961 /// \param b 1962 /// A 128-bit vector of [4 x float]. 1963 /// \param c 1964 /// An immediate integer operand, with bits [4:0] specifying which comparison 1965 /// operation to use: \n 1966 /// 0x00 : Equal (ordered, non-signaling) 1967 /// 0x01 : Less-than (ordered, signaling) 1968 /// 0x02 : Less-than-or-equal (ordered, signaling) 1969 /// 0x03 : Unordered (non-signaling) 1970 /// 0x04 : Not-equal (unordered, non-signaling) 1971 /// 0x05 : Not-less-than (unordered, signaling) 1972 /// 0x06 : Not-less-than-or-equal (unordered, signaling) 1973 /// 0x07 : Ordered (non-signaling) 1974 /// 0x08 : Equal (unordered, non-signaling) 1975 /// 0x09 : Not-greater-than-or-equal (unordered, signaling) 1976 /// 0x0a : Not-greater-than (unordered, signaling) 1977 /// 0x0b : False (ordered, non-signaling) 1978 /// 0x0c : Not-equal (ordered, non-signaling) 1979 /// 0x0d : Greater-than-or-equal (ordered, signaling) 1980 /// 0x0e : Greater-than (ordered, signaling) 1981 /// 0x0f : True (unordered, non-signaling) 1982 /// 0x10 : Equal (ordered, signaling) 1983 /// 0x11 : Less-than (ordered, non-signaling) 1984 /// 0x12 : Less-than-or-equal (ordered, non-signaling) 1985 /// 0x13 : Unordered (signaling) 1986 /// 0x14 : Not-equal 
(unordered, signaling) 1987 /// 0x15 : Not-less-than (unordered, non-signaling) 1988 /// 0x16 : Not-less-than-or-equal (unordered, non-signaling) 1989 /// 0x17 : Ordered (signaling) 1990 /// 0x18 : Equal (unordered, signaling) 1991 /// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) 1992 /// 0x1a : Not-greater-than (unordered, non-signaling) 1993 /// 0x1b : False (ordered, signaling) 1994 /// 0x1c : Not-equal (ordered, signaling) 1995 /// 0x1d : Greater-than-or-equal (ordered, non-signaling) 1996 /// 0x1e : Greater-than (ordered, non-signaling) 1997 /// 0x1f : True (unordered, signaling) 1998 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 1999 #define _mm_cmp_ss(a, b, c) __extension__ ({ \ 2000 (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \ 2001 (__v4sf)(__m128)(b), (c)); }) 2002 2003 /// \brief Takes a [8 x i32] vector and returns the vector element value 2004 /// indexed by the immediate constant operand. 2005 /// 2006 /// \headerfile <x86intrin.h> 2007 /// 2008 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 2009 /// instruction. 2010 /// 2011 /// \param __a 2012 /// A 256-bit vector of [8 x i32]. 2013 /// \param __imm 2014 /// An immediate integer operand with bits [2:0] determining which vector 2015 /// element is extracted and returned. 2016 /// \returns A 32-bit integer containing the extracted 32 bits of extended 2017 /// packed data. 2018 static __inline int __DEFAULT_FN_ATTRS 2019 _mm256_extract_epi32(__m256i __a, const int __imm) 2020 { 2021 __v8si __b = (__v8si)__a; 2022 return __b[__imm & 7]; 2023 } 2024 2025 /// \brief Takes a [16 x i16] vector and returns the vector element value 2026 /// indexed by the immediate constant operand. 2027 /// 2028 /// \headerfile <x86intrin.h> 2029 /// 2030 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 2031 /// instruction. 2032 /// 2033 /// \param __a 2034 /// A 256-bit integer vector of [16 x i16]. 
2035 /// \param __imm 2036 /// An immediate integer operand with bits [3:0] determining which vector 2037 /// element is extracted and returned. 2038 /// \returns A 32-bit integer containing the extracted 16 bits of zero extended 2039 /// packed data. 2040 static __inline int __DEFAULT_FN_ATTRS 2041 _mm256_extract_epi16(__m256i __a, const int __imm) 2042 { 2043 __v16hi __b = (__v16hi)__a; 2044 return (unsigned short)__b[__imm & 15]; 2045 } 2046 2047 /// \brief Takes a [32 x i8] vector and returns the vector element value 2048 /// indexed by the immediate constant operand. 2049 /// 2050 /// \headerfile <x86intrin.h> 2051 /// 2052 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 2053 /// instruction. 2054 /// 2055 /// \param __a 2056 /// A 256-bit integer vector of [32 x i8]. 2057 /// \param __imm 2058 /// An immediate integer operand with bits [4:0] determining which vector 2059 /// element is extracted and returned. 2060 /// \returns A 32-bit integer containing the extracted 8 bits of zero extended 2061 /// packed data. 2062 static __inline int __DEFAULT_FN_ATTRS 2063 _mm256_extract_epi8(__m256i __a, const int __imm) 2064 { 2065 __v32qi __b = (__v32qi)__a; 2066 return (unsigned char)__b[__imm & 31]; 2067 } 2068 2069 #ifdef __x86_64__ 2070 /// \brief Takes a [4 x i64] vector and returns the vector element value 2071 /// indexed by the immediate constant operand. 2072 /// 2073 /// \headerfile <x86intrin.h> 2074 /// 2075 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 2076 /// instruction. 2077 /// 2078 /// \param __a 2079 /// A 256-bit integer vector of [4 x i64]. 2080 /// \param __imm 2081 /// An immediate integer operand with bits [1:0] determining which vector 2082 /// element is extracted and returned. 2083 /// \returns A 64-bit integer containing the extracted 64 bits of extended 2084 /// packed data. 
2085 static __inline long long __DEFAULT_FN_ATTRS 2086 _mm256_extract_epi64(__m256i __a, const int __imm) 2087 { 2088 __v4di __b = (__v4di)__a; 2089 return __b[__imm & 3]; 2090 } 2091 #endif 2092 2093 /// \brief Takes a [8 x i32] vector and replaces the vector element value 2094 /// indexed by the immediate constant operand by a new value. Returns the 2095 /// modified vector. 2096 /// 2097 /// \headerfile <x86intrin.h> 2098 /// 2099 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2100 /// instruction. 2101 /// 2102 /// \param __a 2103 /// A vector of [8 x i32] to be used by the insert operation. 2104 /// \param __b 2105 /// An integer value. The replacement value for the insert operation. 2106 /// \param __imm 2107 /// An immediate integer specifying the index of the vector element to be 2108 /// replaced. 2109 /// \returns A copy of vector \a __a, after replacing its element indexed by 2110 /// \a __imm with \a __b. 2111 static __inline __m256i __DEFAULT_FN_ATTRS 2112 _mm256_insert_epi32(__m256i __a, int __b, int const __imm) 2113 { 2114 __v8si __c = (__v8si)__a; 2115 __c[__imm & 7] = __b; 2116 return (__m256i)__c; 2117 } 2118 2119 2120 /// \brief Takes a [16 x i16] vector and replaces the vector element value 2121 /// indexed by the immediate constant operand with a new value. Returns the 2122 /// modified vector. 2123 /// 2124 /// \headerfile <x86intrin.h> 2125 /// 2126 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2127 /// instruction. 2128 /// 2129 /// \param __a 2130 /// A vector of [16 x i16] to be used by the insert operation. 2131 /// \param __b 2132 /// An i16 integer value. The replacement value for the insert operation. 2133 /// \param __imm 2134 /// An immediate integer specifying the index of the vector element to be 2135 /// replaced. 2136 /// \returns A copy of vector \a __a, after replacing its element indexed by 2137 /// \a __imm with \a __b. 
2138 static __inline __m256i __DEFAULT_FN_ATTRS 2139 _mm256_insert_epi16(__m256i __a, int __b, int const __imm) 2140 { 2141 __v16hi __c = (__v16hi)__a; 2142 __c[__imm & 15] = __b; 2143 return (__m256i)__c; 2144 } 2145 2146 /// \brief Takes a [32 x i8] vector and replaces the vector element value 2147 /// indexed by the immediate constant operand with a new value. Returns the 2148 /// modified vector. 2149 /// 2150 /// \headerfile <x86intrin.h> 2151 /// 2152 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2153 /// instruction. 2154 /// 2155 /// \param __a 2156 /// A vector of [32 x i8] to be used by the insert operation. 2157 /// \param __b 2158 /// An i8 integer value. The replacement value for the insert operation. 2159 /// \param __imm 2160 /// An immediate integer specifying the index of the vector element to be 2161 /// replaced. 2162 /// \returns A copy of vector \a __a, after replacing its element indexed by 2163 /// \a __imm with \a __b. 2164 static __inline __m256i __DEFAULT_FN_ATTRS 2165 _mm256_insert_epi8(__m256i __a, int __b, int const __imm) 2166 { 2167 __v32qi __c = (__v32qi)__a; 2168 __c[__imm & 31] = __b; 2169 return (__m256i)__c; 2170 } 2171 2172 #ifdef __x86_64__ 2173 /// \brief Takes a [4 x i64] vector and replaces the vector element value 2174 /// indexed by the immediate constant operand with a new value. Returns the 2175 /// modified vector. 2176 /// 2177 /// \headerfile <x86intrin.h> 2178 /// 2179 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2180 /// instruction. 2181 /// 2182 /// \param __a 2183 /// A vector of [4 x i64] to be used by the insert operation. 2184 /// \param __b 2185 /// A 64-bit integer value. The replacement value for the insert operation. 2186 /// \param __imm 2187 /// An immediate integer specifying the index of the vector element to be 2188 /// replaced. 2189 /// \returns A copy of vector \a __a, after replacing its element indexed by 2190 /// \a __imm with \a __b. 
2191 static __inline __m256i __DEFAULT_FN_ATTRS 2192 _mm256_insert_epi64(__m256i __a, long long __b, int const __imm) 2193 { 2194 __v4di __c = (__v4di)__a; 2195 __c[__imm & 3] = __b; 2196 return (__m256i)__c; 2197 } 2198 #endif 2199 2200 /* Conversion */ 2201 /// \brief Converts a vector of [4 x i32] into a vector of [4 x double]. 2202 /// 2203 /// \headerfile <x86intrin.h> 2204 /// 2205 /// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction. 2206 /// 2207 /// \param __a 2208 /// A 128-bit integer vector of [4 x i32]. 2209 /// \returns A 256-bit vector of [4 x double] containing the converted values. 2210 static __inline __m256d __DEFAULT_FN_ATTRS 2211 _mm256_cvtepi32_pd(__m128i __a) 2212 { 2213 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df); 2214 } 2215 2216 /// \brief Converts a vector of [8 x i32] into a vector of [8 x float]. 2217 /// 2218 /// \headerfile <x86intrin.h> 2219 /// 2220 /// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction. 2221 /// 2222 /// \param __a 2223 /// A 256-bit integer vector. 2224 /// \returns A 256-bit vector of [8 x float] containing the converted values. 2225 static __inline __m256 __DEFAULT_FN_ATTRS 2226 _mm256_cvtepi32_ps(__m256i __a) 2227 { 2228 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a); 2229 } 2230 2231 /// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of 2232 /// [4 x float]. 2233 /// 2234 /// \headerfile <x86intrin.h> 2235 /// 2236 /// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction. 2237 /// 2238 /// \param __a 2239 /// A 256-bit vector of [4 x double]. 2240 /// \returns A 128-bit vector of [4 x float] containing the converted values. 2241 static __inline __m128 __DEFAULT_FN_ATTRS 2242 _mm256_cvtpd_ps(__m256d __a) 2243 { 2244 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a); 2245 } 2246 2247 /// \brief Converts a vector of [8 x float] into a vector of [8 x i32]. 
2248 /// 2249 /// \headerfile <x86intrin.h> 2250 /// 2251 /// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction. 2252 /// 2253 /// \param __a 2254 /// A 256-bit vector of [8 x float]. 2255 /// \returns A 256-bit integer vector containing the converted values. 2256 static __inline __m256i __DEFAULT_FN_ATTRS 2257 _mm256_cvtps_epi32(__m256 __a) 2258 { 2259 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a); 2260 } 2261 2262 /// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 2263 /// x double]. 2264 /// 2265 /// \headerfile <x86intrin.h> 2266 /// 2267 /// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction. 2268 /// 2269 /// \param __a 2270 /// A 128-bit vector of [4 x float]. 2271 /// \returns A 256-bit vector of [4 x double] containing the converted values. 2272 static __inline __m256d __DEFAULT_FN_ATTRS 2273 _mm256_cvtps_pd(__m128 __a) 2274 { 2275 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df); 2276 } 2277 2278 /// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 2279 /// x i32], truncating the result by rounding towards zero when it is 2280 /// inexact. 2281 /// 2282 /// \headerfile <x86intrin.h> 2283 /// 2284 /// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction. 2285 /// 2286 /// \param __a 2287 /// A 256-bit vector of [4 x double]. 2288 /// \returns A 128-bit integer vector containing the converted values. 2289 static __inline __m128i __DEFAULT_FN_ATTRS 2290 _mm256_cvttpd_epi32(__m256d __a) 2291 { 2292 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a); 2293 } 2294 2295 /// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 2296 /// x i32]. When a conversion is inexact, the value returned is rounded 2297 /// according to the rounding control bits in the MXCSR register. 2298 /// 2299 /// \headerfile <x86intrin.h> 2300 /// 2301 /// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction. 
2302 /// 2303 /// \param __a 2304 /// A 256-bit vector of [4 x double]. 2305 /// \returns A 128-bit integer vector containing the converted values. 2306 static __inline __m128i __DEFAULT_FN_ATTRS 2307 _mm256_cvtpd_epi32(__m256d __a) 2308 { 2309 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a); 2310 } 2311 2312 /// \brief Converts a vector of [8 x float] into a vector of [8 x i32], 2313 /// truncating the result by rounding towards zero when it is inexact. 2314 /// 2315 /// \headerfile <x86intrin.h> 2316 /// 2317 /// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction. 2318 /// 2319 /// \param __a 2320 /// A 256-bit vector of [8 x float]. 2321 /// \returns A 256-bit integer vector containing the converted values. 2322 static __inline __m256i __DEFAULT_FN_ATTRS 2323 _mm256_cvttps_epi32(__m256 __a) 2324 { 2325 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a); 2326 } 2327 2328 /// \brief Returns the first element of the input vector of [4 x double]. 2329 /// 2330 /// \headerfile <avxintrin.h> 2331 /// 2332 /// This intrinsic is a utility function and does not correspond to a specific 2333 /// instruction. 2334 /// 2335 /// \param __a 2336 /// A 256-bit vector of [4 x double]. 2337 /// \returns A 64 bit double containing the first element of the input vector. 2338 static __inline double __DEFAULT_FN_ATTRS 2339 _mm256_cvtsd_f64(__m256d __a) 2340 { 2341 return __a[0]; 2342 } 2343 2344 /// \brief Returns the first element of the input vector of [8 x i32]. 2345 /// 2346 /// \headerfile <avxintrin.h> 2347 /// 2348 /// This intrinsic is a utility function and does not correspond to a specific 2349 /// instruction. 2350 /// 2351 /// \param __a 2352 /// A 256-bit vector of [8 x i32]. 2353 /// \returns A 32 bit integer containing the first element of the input vector. 
2354 static __inline int __DEFAULT_FN_ATTRS 2355 _mm256_cvtsi256_si32(__m256i __a) 2356 { 2357 __v8si __b = (__v8si)__a; 2358 return __b[0]; 2359 } 2360 2361 /// \brief Returns the first element of the input vector of [8 x float]. 2362 /// 2363 /// \headerfile <avxintrin.h> 2364 /// 2365 /// This intrinsic is a utility function and does not correspond to a specific 2366 /// instruction. 2367 /// 2368 /// \param __a 2369 /// A 256-bit vector of [8 x float]. 2370 /// \returns A 32 bit float containing the first element of the input vector. 2371 static __inline float __DEFAULT_FN_ATTRS 2372 _mm256_cvtss_f32(__m256 __a) 2373 { 2374 return __a[0]; 2375 } 2376 2377 /* Vector replicate */ 2378 /// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit 2379 /// vector of [8 x float] to float values in a 256-bit vector of 2380 /// [8 x float]. 2381 /// 2382 /// \headerfile <x86intrin.h> 2383 /// 2384 /// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction. 2385 /// 2386 /// \param __a 2387 /// A 256-bit vector of [8 x float]. \n 2388 /// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of 2389 /// the return value. \n 2390 /// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of 2391 /// the return value. \n 2392 /// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the 2393 /// return value. \n 2394 /// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the 2395 /// return value. 2396 /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated 2397 /// values. 2398 static __inline __m256 __DEFAULT_FN_ATTRS 2399 _mm256_movehdup_ps(__m256 __a) 2400 { 2401 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7); 2402 } 2403 2404 /// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit 2405 /// vector of [8 x float] to float values in a 256-bit vector of [8 x float]. 
2406 /// 2407 /// \headerfile <x86intrin.h> 2408 /// 2409 /// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction. 2410 /// 2411 /// \param __a 2412 /// A 256-bit vector of [8 x float]. \n 2413 /// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of 2414 /// the return value. \n 2415 /// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of 2416 /// the return value. \n 2417 /// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the 2418 /// return value. \n 2419 /// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the 2420 /// return value. 2421 /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated 2422 /// values. 2423 static __inline __m256 __DEFAULT_FN_ATTRS 2424 _mm256_moveldup_ps(__m256 __a) 2425 { 2426 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6); 2427 } 2428 2429 /// \brief Moves and duplicates double-precision floating point values from a 2430 /// 256-bit vector of [4 x double] to double-precision values in a 256-bit 2431 /// vector of [4 x double]. 2432 /// 2433 /// \headerfile <x86intrin.h> 2434 /// 2435 /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. 2436 /// 2437 /// \param __a 2438 /// A 256-bit vector of [4 x double]. \n 2439 /// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the 2440 /// return value. \n 2441 /// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of 2442 /// the return value. 2443 /// \returns A 256-bit vector of [4 x double] containing the moved and 2444 /// duplicated values. 
2445 static __inline __m256d __DEFAULT_FN_ATTRS 2446 _mm256_movedup_pd(__m256d __a) 2447 { 2448 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2); 2449 } 2450 2451 /* Unpack and Interleave */ 2452 /// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of 2453 /// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. 2454 /// 2455 /// \headerfile <x86intrin.h> 2456 /// 2457 /// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction. 2458 /// 2459 /// \param __a 2460 /// A 256-bit floating-point vector of [4 x double]. \n 2461 /// Bits [127:64] are written to bits [63:0] of the return value. \n 2462 /// Bits [255:192] are written to bits [191:128] of the return value. \n 2463 /// \param __b 2464 /// A 256-bit floating-point vector of [4 x double]. \n 2465 /// Bits [127:64] are written to bits [127:64] of the return value. \n 2466 /// Bits [255:192] are written to bits [255:192] of the return value. \n 2467 /// \returns A 256-bit vector of [4 x double] containing the interleaved values. 2468 static __inline __m256d __DEFAULT_FN_ATTRS 2469 _mm256_unpackhi_pd(__m256d __a, __m256d __b) 2470 { 2471 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2); 2472 } 2473 2474 /// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of 2475 /// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. 2476 /// 2477 /// \headerfile <x86intrin.h> 2478 /// 2479 /// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction. 2480 /// 2481 /// \param __a 2482 /// A 256-bit floating-point vector of [4 x double]. \n 2483 /// Bits [63:0] are written to bits [63:0] of the return value. \n 2484 /// Bits [191:128] are written to bits [191:128] of the return value. 2485 /// \param __b 2486 /// A 256-bit floating-point vector of [4 x double]. \n 2487 /// Bits [63:0] are written to bits [127:64] of the return value. 
\n 2488 /// Bits [191:128] are written to bits [255:192] of the return value. \n 2489 /// \returns A 256-bit vector of [4 x double] containing the interleaved values. 2490 static __inline __m256d __DEFAULT_FN_ATTRS 2491 _mm256_unpacklo_pd(__m256d __a, __m256d __b) 2492 { 2493 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2); 2494 } 2495 2496 /// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the 2497 /// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit 2498 /// vector of [8 x float]. 2499 /// 2500 /// \headerfile <x86intrin.h> 2501 /// 2502 /// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction. 2503 /// 2504 /// \param __a 2505 /// A 256-bit vector of [8 x float]. \n 2506 /// Bits [95:64] are written to bits [31:0] of the return value. \n 2507 /// Bits [127:96] are written to bits [95:64] of the return value. \n 2508 /// Bits [223:192] are written to bits [159:128] of the return value. \n 2509 /// Bits [255:224] are written to bits [223:192] of the return value. 2510 /// \param __b 2511 /// A 256-bit vector of [8 x float]. \n 2512 /// Bits [95:64] are written to bits [63:32] of the return value. \n 2513 /// Bits [127:96] are written to bits [127:96] of the return value. \n 2514 /// Bits [223:192] are written to bits [191:160] of the return value. \n 2515 /// Bits [255:224] are written to bits [255:224] of the return value. 2516 /// \returns A 256-bit vector of [8 x float] containing the interleaved values. 2517 static __inline __m256 __DEFAULT_FN_ATTRS 2518 _mm256_unpackhi_ps(__m256 __a, __m256 __b) 2519 { 2520 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); 2521 } 2522 2523 /// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the 2524 /// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit 2525 /// vector of [8 x float]. 
2526 /// 2527 /// \headerfile <x86intrin.h> 2528 /// 2529 /// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction. 2530 /// 2531 /// \param __a 2532 /// A 256-bit vector of [8 x float]. \n 2533 /// Bits [31:0] are written to bits [31:0] of the return value. \n 2534 /// Bits [63:32] are written to bits [95:64] of the return value. \n 2535 /// Bits [159:128] are written to bits [159:128] of the return value. \n 2536 /// Bits [191:160] are written to bits [223:192] of the return value. 2537 /// \param __b 2538 /// A 256-bit vector of [8 x float]. \n 2539 /// Bits [31:0] are written to bits [63:32] of the return value. \n 2540 /// Bits [63:32] are written to bits [127:96] of the return value. \n 2541 /// Bits [159:128] are written to bits [191:160] of the return value. \n 2542 /// Bits [191:160] are written to bits [255:224] of the return value. 2543 /// \returns A 256-bit vector of [8 x float] containing the interleaved values. 2544 static __inline __m256 __DEFAULT_FN_ATTRS 2545 _mm256_unpacklo_ps(__m256 __a, __m256 __b) 2546 { 2547 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); 2548 } 2549 2550 /* Bit Test */ 2551 /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an 2552 /// element-by-element comparison of the double-precision element in the 2553 /// first source vector and the corresponding element in the second source 2554 /// vector. 2555 /// 2556 /// The EFLAGS register is updated as follows: \n 2557 /// If there is at least one pair of double-precision elements where the 2558 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2559 /// ZF flag is set to 1. \n 2560 /// If there is at least one pair of double-precision elements where the 2561 /// sign-bit of the first element is 0 and the sign-bit of the second element 2562 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2563 /// This intrinsic returns the value of the ZF flag. 
2564 /// 2565 /// \headerfile <x86intrin.h> 2566 /// 2567 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2568 /// 2569 /// \param __a 2570 /// A 128-bit vector of [2 x double]. 2571 /// \param __b 2572 /// A 128-bit vector of [2 x double]. 2573 /// \returns the ZF flag in the EFLAGS register. 2574 static __inline int __DEFAULT_FN_ATTRS 2575 _mm_testz_pd(__m128d __a, __m128d __b) 2576 { 2577 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b); 2578 } 2579 2580 /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an 2581 /// element-by-element comparison of the double-precision element in the 2582 /// first source vector and the corresponding element in the second source 2583 /// vector. 2584 /// 2585 /// The EFLAGS register is updated as follows: \n 2586 /// If there is at least one pair of double-precision elements where the 2587 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2588 /// ZF flag is set to 1. \n 2589 /// If there is at least one pair of double-precision elements where the 2590 /// sign-bit of the first element is 0 and the sign-bit of the second element 2591 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2592 /// This intrinsic returns the value of the CF flag. 2593 /// 2594 /// \headerfile <x86intrin.h> 2595 /// 2596 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2597 /// 2598 /// \param __a 2599 /// A 128-bit vector of [2 x double]. 2600 /// \param __b 2601 /// A 128-bit vector of [2 x double]. 2602 /// \returns the CF flag in the EFLAGS register. 
2603 static __inline int __DEFAULT_FN_ATTRS 2604 _mm_testc_pd(__m128d __a, __m128d __b) 2605 { 2606 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b); 2607 } 2608 2609 /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an 2610 /// element-by-element comparison of the double-precision element in the 2611 /// first source vector and the corresponding element in the second source 2612 /// vector. 2613 /// 2614 /// The EFLAGS register is updated as follows: \n 2615 /// If there is at least one pair of double-precision elements where the 2616 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2617 /// ZF flag is set to 1. \n 2618 /// If there is at least one pair of double-precision elements where the 2619 /// sign-bit of the first element is 0 and the sign-bit of the second element 2620 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2621 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2622 /// otherwise it returns 0. 2623 /// 2624 /// \headerfile <x86intrin.h> 2625 /// 2626 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2627 /// 2628 /// \param __a 2629 /// A 128-bit vector of [2 x double]. 2630 /// \param __b 2631 /// A 128-bit vector of [2 x double]. 2632 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2633 static __inline int __DEFAULT_FN_ATTRS 2634 _mm_testnzc_pd(__m128d __a, __m128d __b) 2635 { 2636 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b); 2637 } 2638 2639 /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an 2640 /// element-by-element comparison of the single-precision element in the 2641 /// first source vector and the corresponding element in the second source 2642 /// vector. 
2643 /// 2644 /// The EFLAGS register is updated as follows: \n 2645 /// If there is at least one pair of single-precision elements where the 2646 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2647 /// ZF flag is set to 1. \n 2648 /// If there is at least one pair of single-precision elements where the 2649 /// sign-bit of the first element is 0 and the sign-bit of the second element 2650 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2651 /// This intrinsic returns the value of the ZF flag. 2652 /// 2653 /// \headerfile <x86intrin.h> 2654 /// 2655 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2656 /// 2657 /// \param __a 2658 /// A 128-bit vector of [4 x float]. 2659 /// \param __b 2660 /// A 128-bit vector of [4 x float]. 2661 /// \returns the ZF flag. 2662 static __inline int __DEFAULT_FN_ATTRS 2663 _mm_testz_ps(__m128 __a, __m128 __b) 2664 { 2665 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b); 2666 } 2667 2668 /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an 2669 /// element-by-element comparison of the single-precision element in the 2670 /// first source vector and the corresponding element in the second source 2671 /// vector. 2672 /// 2673 /// The EFLAGS register is updated as follows: \n 2674 /// If there is at least one pair of single-precision elements where the 2675 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2676 /// ZF flag is set to 1. \n 2677 /// If there is at least one pair of single-precision elements where the 2678 /// sign-bit of the first element is 0 and the sign-bit of the second element 2679 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2680 /// This intrinsic returns the value of the CF flag. 2681 /// 2682 /// \headerfile <x86intrin.h> 2683 /// 2684 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 
2685 /// 2686 /// \param __a 2687 /// A 128-bit vector of [4 x float]. 2688 /// \param __b 2689 /// A 128-bit vector of [4 x float]. 2690 /// \returns the CF flag. 2691 static __inline int __DEFAULT_FN_ATTRS 2692 _mm_testc_ps(__m128 __a, __m128 __b) 2693 { 2694 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b); 2695 } 2696 2697 /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an 2698 /// element-by-element comparison of the single-precision element in the 2699 /// first source vector and the corresponding element in the second source 2700 /// vector. 2701 /// 2702 /// The EFLAGS register is updated as follows: \n 2703 /// If there is at least one pair of single-precision elements where the 2704 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2705 /// ZF flag is set to 1. \n 2706 /// If there is at least one pair of single-precision elements where the 2707 /// sign-bit of the first element is 0 and the sign-bit of the second element 2708 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2709 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2710 /// otherwise it returns 0. 2711 /// 2712 /// \headerfile <x86intrin.h> 2713 /// 2714 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2715 /// 2716 /// \param __a 2717 /// A 128-bit vector of [4 x float]. 2718 /// \param __b 2719 /// A 128-bit vector of [4 x float]. 2720 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2721 static __inline int __DEFAULT_FN_ATTRS 2722 _mm_testnzc_ps(__m128 __a, __m128 __b) 2723 { 2724 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b); 2725 } 2726 2727 /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an 2728 /// element-by-element comparison of the double-precision elements in the 2729 /// first source vector and the corresponding elements in the second source 2730 /// vector. 
2731 /// 2732 /// The EFLAGS register is updated as follows: \n 2733 /// If there is at least one pair of double-precision elements where the 2734 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2735 /// ZF flag is set to 1. \n 2736 /// If there is at least one pair of double-precision elements where the 2737 /// sign-bit of the first element is 0 and the sign-bit of the second element 2738 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2739 /// This intrinsic returns the value of the ZF flag. 2740 /// 2741 /// \headerfile <x86intrin.h> 2742 /// 2743 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2744 /// 2745 /// \param __a 2746 /// A 256-bit vector of [4 x double]. 2747 /// \param __b 2748 /// A 256-bit vector of [4 x double]. 2749 /// \returns the ZF flag. 2750 static __inline int __DEFAULT_FN_ATTRS 2751 _mm256_testz_pd(__m256d __a, __m256d __b) 2752 { 2753 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b); 2754 } 2755 2756 /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an 2757 /// element-by-element comparison of the double-precision elements in the 2758 /// first source vector and the corresponding elements in the second source 2759 /// vector. 2760 /// 2761 /// The EFLAGS register is updated as follows: \n 2762 /// If there is at least one pair of double-precision elements where the 2763 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2764 /// ZF flag is set to 1. \n 2765 /// If there is at least one pair of double-precision elements where the 2766 /// sign-bit of the first element is 0 and the sign-bit of the second element 2767 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2768 /// This intrinsic returns the value of the CF flag. 2769 /// 2770 /// \headerfile <x86intrin.h> 2771 /// 2772 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 
2773 /// 2774 /// \param __a 2775 /// A 256-bit vector of [4 x double]. 2776 /// \param __b 2777 /// A 256-bit vector of [4 x double]. 2778 /// \returns the CF flag. 2779 static __inline int __DEFAULT_FN_ATTRS 2780 _mm256_testc_pd(__m256d __a, __m256d __b) 2781 { 2782 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b); 2783 } 2784 2785 /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an 2786 /// element-by-element comparison of the double-precision elements in the 2787 /// first source vector and the corresponding elements in the second source 2788 /// vector. 2789 /// 2790 /// The EFLAGS register is updated as follows: \n 2791 /// If there is at least one pair of double-precision elements where the 2792 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2793 /// ZF flag is set to 1. \n 2794 /// If there is at least one pair of double-precision elements where the 2795 /// sign-bit of the first element is 0 and the sign-bit of the second element 2796 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2797 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2798 /// otherwise it returns 0. 2799 /// 2800 /// \headerfile <x86intrin.h> 2801 /// 2802 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2803 /// 2804 /// \param __a 2805 /// A 256-bit vector of [4 x double]. 2806 /// \param __b 2807 /// A 256-bit vector of [4 x double]. 2808 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2809 static __inline int __DEFAULT_FN_ATTRS 2810 _mm256_testnzc_pd(__m256d __a, __m256d __b) 2811 { 2812 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b); 2813 } 2814 2815 /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an 2816 /// element-by-element comparison of the single-precision element in the 2817 /// first source vector and the corresponding element in the second source 2818 /// vector. 
2819 /// 2820 /// The EFLAGS register is updated as follows: \n 2821 /// If there is at least one pair of single-precision elements where the 2822 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2823 /// ZF flag is set to 1. \n 2824 /// If there is at least one pair of single-precision elements where the 2825 /// sign-bit of the first element is 0 and the sign-bit of the second element 2826 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2827 /// This intrinsic returns the value of the ZF flag. 2828 /// 2829 /// \headerfile <x86intrin.h> 2830 /// 2831 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2832 /// 2833 /// \param __a 2834 /// A 256-bit vector of [8 x float]. 2835 /// \param __b 2836 /// A 256-bit vector of [8 x float]. 2837 /// \returns the ZF flag. 2838 static __inline int __DEFAULT_FN_ATTRS 2839 _mm256_testz_ps(__m256 __a, __m256 __b) 2840 { 2841 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b); 2842 } 2843 2844 /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an 2845 /// element-by-element comparison of the single-precision element in the 2846 /// first source vector and the corresponding element in the second source 2847 /// vector. 2848 /// 2849 /// The EFLAGS register is updated as follows: \n 2850 /// If there is at least one pair of single-precision elements where the 2851 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2852 /// ZF flag is set to 1. \n 2853 /// If there is at least one pair of single-precision elements where the 2854 /// sign-bit of the first element is 0 and the sign-bit of the second element 2855 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2856 /// This intrinsic returns the value of the CF flag. 2857 /// 2858 /// \headerfile <x86intrin.h> 2859 /// 2860 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 
2861 /// 2862 /// \param __a 2863 /// A 256-bit vector of [8 x float]. 2864 /// \param __b 2865 /// A 256-bit vector of [8 x float]. 2866 /// \returns the CF flag. 2867 static __inline int __DEFAULT_FN_ATTRS 2868 _mm256_testc_ps(__m256 __a, __m256 __b) 2869 { 2870 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b); 2871 } 2872 2873 /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an 2874 /// element-by-element comparison of the single-precision elements in the 2875 /// first source vector and the corresponding elements in the second source 2876 /// vector. 2877 /// 2878 /// The EFLAGS register is updated as follows: \n 2879 /// If there is at least one pair of single-precision elements where the 2880 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2881 /// ZF flag is set to 1. \n 2882 /// If there is at least one pair of single-precision elements where the 2883 /// sign-bit of the first element is 0 and the sign-bit of the second element 2884 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2885 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2886 /// otherwise it returns 0. 2887 /// 2888 /// \headerfile <x86intrin.h> 2889 /// 2890 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2891 /// 2892 /// \param __a 2893 /// A 256-bit vector of [8 x float]. 2894 /// \param __b 2895 /// A 256-bit vector of [8 x float]. 2896 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2897 static __inline int __DEFAULT_FN_ATTRS 2898 _mm256_testnzc_ps(__m256 __a, __m256 __b) 2899 { 2900 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b); 2901 } 2902 2903 /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison 2904 /// of the two source vectors. 
2905 /// 2906 /// The EFLAGS register is updated as follows: \n 2907 /// If there is at least one pair of bits where both bits are 1, the ZF flag 2908 /// is set to 0. Otherwise the ZF flag is set to 1. \n 2909 /// If there is at least one pair of bits where the bit from the first source 2910 /// vector is 0 and the bit from the second source vector is 1, the CF flag 2911 /// is set to 0. Otherwise the CF flag is set to 1. \n 2912 /// This intrinsic returns the value of the ZF flag. 2913 /// 2914 /// \headerfile <x86intrin.h> 2915 /// 2916 /// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2917 /// 2918 /// \param __a 2919 /// A 256-bit integer vector. 2920 /// \param __b 2921 /// A 256-bit integer vector. 2922 /// \returns the ZF flag. 2923 static __inline int __DEFAULT_FN_ATTRS 2924 _mm256_testz_si256(__m256i __a, __m256i __b) 2925 { 2926 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b); 2927 } 2928 2929 /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison 2930 /// of the two source vectors. 2931 /// 2932 /// The EFLAGS register is updated as follows: \n 2933 /// If there is at least one pair of bits where both bits are 1, the ZF flag 2934 /// is set to 0. Otherwise the ZF flag is set to 1. \n 2935 /// If there is at least one pair of bits where the bit from the first source 2936 /// vector is 0 and the bit from the second source vector is 1, the CF flag 2937 /// is set to 0. Otherwise the CF flag is set to 1. \n 2938 /// This intrinsic returns the value of the CF flag. 2939 /// 2940 /// \headerfile <x86intrin.h> 2941 /// 2942 /// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2943 /// 2944 /// \param __a 2945 /// A 256-bit integer vector. 2946 /// \param __b 2947 /// A 256-bit integer vector. 2948 /// \returns the CF flag. 
2949 static __inline int __DEFAULT_FN_ATTRS 2950 _mm256_testc_si256(__m256i __a, __m256i __b) 2951 { 2952 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b); 2953 } 2954 2955 /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison 2956 /// of the two source vectors. 2957 /// 2958 /// The EFLAGS register is updated as follows: \n 2959 /// If there is at least one pair of bits where both bits are 1, the ZF flag 2960 /// is set to 0. Otherwise the ZF flag is set to 1. \n 2961 /// If there is at least one pair of bits where the bit from the first source 2962 /// vector is 0 and the bit from the second source vector is 1, the CF flag 2963 /// is set to 0. Otherwise the CF flag is set to 1. \n 2964 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2965 /// otherwise it returns 0. 2966 /// 2967 /// \headerfile <x86intrin.h> 2968 /// 2969 /// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2970 /// 2971 /// \param __a 2972 /// A 256-bit integer vector. 2973 /// \param __b 2974 /// A 256-bit integer vector. 2975 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2976 static __inline int __DEFAULT_FN_ATTRS 2977 _mm256_testnzc_si256(__m256i __a, __m256i __b) 2978 { 2979 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b); 2980 } 2981 2982 /* Vector extract sign mask */ 2983 /// \brief Extracts the sign bits of double-precision floating point elements 2984 /// in a 256-bit vector of [4 x double] and writes them to the lower order 2985 /// bits of the return value. 2986 /// 2987 /// \headerfile <x86intrin.h> 2988 /// 2989 /// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction. 2990 /// 2991 /// \param __a 2992 /// A 256-bit vector of [4 x double] containing the double-precision 2993 /// floating point values with sign bits to be extracted. 2994 /// \returns The sign bits from the operand, written to bits [3:0]. 
2995 static __inline int __DEFAULT_FN_ATTRS 2996 _mm256_movemask_pd(__m256d __a) 2997 { 2998 return __builtin_ia32_movmskpd256((__v4df)__a); 2999 } 3000 3001 /// \brief Extracts the sign bits of double-precision floating point elements 3002 /// in a 256-bit vector of [8 x float] and writes them to the lower order 3003 /// bits of the return value. 3004 /// 3005 /// \headerfile <x86intrin.h> 3006 /// 3007 /// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction. 3008 /// 3009 /// \param __a 3010 /// A 256-bit vector of [8 x float] containing the double-precision floating 3011 /// point values with sign bits to be extracted. 3012 /// \returns The sign bits from the operand, written to bits [7:0]. 3013 static __inline int __DEFAULT_FN_ATTRS 3014 _mm256_movemask_ps(__m256 __a) 3015 { 3016 return __builtin_ia32_movmskps256((__v8sf)__a); 3017 } 3018 3019 /* Vector __zero */ 3020 /// \brief Zeroes the contents of all XMM or YMM registers. 3021 /// 3022 /// \headerfile <x86intrin.h> 3023 /// 3024 /// This intrinsic corresponds to the <c> VZEROALL </c> instruction. 3025 static __inline void __DEFAULT_FN_ATTRS 3026 _mm256_zeroall(void) 3027 { 3028 __builtin_ia32_vzeroall(); 3029 } 3030 3031 /// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers. 3032 /// 3033 /// \headerfile <x86intrin.h> 3034 /// 3035 /// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction. 3036 static __inline void __DEFAULT_FN_ATTRS 3037 _mm256_zeroupper(void) 3038 { 3039 __builtin_ia32_vzeroupper(); 3040 } 3041 3042 /* Vector load with broadcast */ 3043 /// \brief Loads a scalar single-precision floating point value from the 3044 /// specified address pointed to by \a __a and broadcasts it to the elements 3045 /// of a [4 x float] vector. 3046 /// 3047 /// \headerfile <x86intrin.h> 3048 /// 3049 /// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. 
3050 /// 3051 /// \param __a 3052 /// The single-precision floating point value to be broadcast. 3053 /// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set 3054 /// equal to the broadcast value. 3055 static __inline __m128 __DEFAULT_FN_ATTRS 3056 _mm_broadcast_ss(float const *__a) 3057 { 3058 float __f = *__a; 3059 return (__m128)(__v4sf){ __f, __f, __f, __f }; 3060 } 3061 3062 /// \brief Loads a scalar double-precision floating point value from the 3063 /// specified address pointed to by \a __a and broadcasts it to the elements 3064 /// of a [4 x double] vector. 3065 /// 3066 /// \headerfile <x86intrin.h> 3067 /// 3068 /// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction. 3069 /// 3070 /// \param __a 3071 /// The double-precision floating point value to be broadcast. 3072 /// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set 3073 /// equal to the broadcast value. 3074 static __inline __m256d __DEFAULT_FN_ATTRS 3075 _mm256_broadcast_sd(double const *__a) 3076 { 3077 double __d = *__a; 3078 return (__m256d)(__v4df){ __d, __d, __d, __d }; 3079 } 3080 3081 /// \brief Loads a scalar single-precision floating point value from the 3082 /// specified address pointed to by \a __a and broadcasts it to the elements 3083 /// of a [8 x float] vector. 3084 /// 3085 /// \headerfile <x86intrin.h> 3086 /// 3087 /// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. 3088 /// 3089 /// \param __a 3090 /// The single-precision floating point value to be broadcast. 3091 /// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set 3092 /// equal to the broadcast value. 
3093 static __inline __m256 __DEFAULT_FN_ATTRS 3094 _mm256_broadcast_ss(float const *__a) 3095 { 3096 float __f = *__a; 3097 return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f }; 3098 } 3099 3100 /// \brief Loads the data from a 128-bit vector of [2 x double] from the 3101 /// specified address pointed to by \a __a and broadcasts it to 128-bit 3102 /// elements in a 256-bit vector of [4 x double]. 3103 /// 3104 /// \headerfile <x86intrin.h> 3105 /// 3106 /// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. 3107 /// 3108 /// \param __a 3109 /// The 128-bit vector of [2 x double] to be broadcast. 3110 /// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set 3111 /// equal to the broadcast value. 3112 static __inline __m256d __DEFAULT_FN_ATTRS 3113 _mm256_broadcast_pd(__m128d const *__a) 3114 { 3115 return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a); 3116 } 3117 3118 /// \brief Loads the data from a 128-bit vector of [4 x float] from the 3119 /// specified address pointed to by \a __a and broadcasts it to 128-bit 3120 /// elements in a 256-bit vector of [8 x float]. 3121 /// 3122 /// \headerfile <x86intrin.h> 3123 /// 3124 /// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. 3125 /// 3126 /// \param __a 3127 /// The 128-bit vector of [4 x float] to be broadcast. 3128 /// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set 3129 /// equal to the broadcast value. 3130 static __inline __m256 __DEFAULT_FN_ATTRS 3131 _mm256_broadcast_ps(__m128 const *__a) 3132 { 3133 return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a); 3134 } 3135 3136 /* SIMD load ops */ 3137 /// \brief Loads 4 double-precision floating point values from a 32-byte aligned 3138 /// memory location pointed to by \a __p into a vector of [4 x double]. 
3139 /// 3140 /// \headerfile <x86intrin.h> 3141 /// 3142 /// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. 3143 /// 3144 /// \param __p 3145 /// A 32-byte aligned pointer to a memory location containing 3146 /// double-precision floating point values. 3147 /// \returns A 256-bit vector of [4 x double] containing the moved values. 3148 static __inline __m256d __DEFAULT_FN_ATTRS 3149 _mm256_load_pd(double const *__p) 3150 { 3151 return *(__m256d *)__p; 3152 } 3153 3154 /// \brief Loads 8 single-precision floating point values from a 32-byte aligned 3155 /// memory location pointed to by \a __p into a vector of [8 x float]. 3156 /// 3157 /// \headerfile <x86intrin.h> 3158 /// 3159 /// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. 3160 /// 3161 /// \param __p 3162 /// A 32-byte aligned pointer to a memory location containing float values. 3163 /// \returns A 256-bit vector of [8 x float] containing the moved values. 3164 static __inline __m256 __DEFAULT_FN_ATTRS 3165 _mm256_load_ps(float const *__p) 3166 { 3167 return *(__m256 *)__p; 3168 } 3169 3170 /// \brief Loads 4 double-precision floating point values from an unaligned 3171 /// memory location pointed to by \a __p into a vector of [4 x double]. 3172 /// 3173 /// \headerfile <x86intrin.h> 3174 /// 3175 /// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. 3176 /// 3177 /// \param __p 3178 /// A pointer to a memory location containing double-precision floating 3179 /// point values. 3180 /// \returns A 256-bit vector of [4 x double] containing the moved values. 3181 static __inline __m256d __DEFAULT_FN_ATTRS 3182 _mm256_loadu_pd(double const *__p) 3183 { 3184 struct __loadu_pd { 3185 __m256d __v; 3186 } __attribute__((__packed__, __may_alias__)); 3187 return ((struct __loadu_pd*)__p)->__v; 3188 } 3189 3190 /// \brief Loads 8 single-precision floating point values from an unaligned 3191 /// memory location pointed to by \a __p into a vector of [8 x float]. 
3192 /// 3193 /// \headerfile <x86intrin.h> 3194 /// 3195 /// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. 3196 /// 3197 /// \param __p 3198 /// A pointer to a memory location containing single-precision floating 3199 /// point values. 3200 /// \returns A 256-bit vector of [8 x float] containing the moved values. 3201 static __inline __m256 __DEFAULT_FN_ATTRS 3202 _mm256_loadu_ps(float const *__p) 3203 { 3204 struct __loadu_ps { 3205 __m256 __v; 3206 } __attribute__((__packed__, __may_alias__)); 3207 return ((struct __loadu_ps*)__p)->__v; 3208 } 3209 3210 /// \brief Loads 256 bits of integer data from a 32-byte aligned memory 3211 /// location pointed to by \a __p into elements of a 256-bit integer vector. 3212 /// 3213 /// \headerfile <x86intrin.h> 3214 /// 3215 /// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. 3216 /// 3217 /// \param __p 3218 /// A 32-byte aligned pointer to a 256-bit integer vector containing integer 3219 /// values. 3220 /// \returns A 256-bit integer vector containing the moved values. 3221 static __inline __m256i __DEFAULT_FN_ATTRS 3222 _mm256_load_si256(__m256i const *__p) 3223 { 3224 return *__p; 3225 } 3226 3227 /// \brief Loads 256 bits of integer data from an unaligned memory location 3228 /// pointed to by \a __p into a 256-bit integer vector. 3229 /// 3230 /// \headerfile <x86intrin.h> 3231 /// 3232 /// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. 3233 /// 3234 /// \param __p 3235 /// A pointer to a 256-bit integer vector containing integer values. 3236 /// \returns A 256-bit integer vector containing the moved values. 
3237 static __inline __m256i __DEFAULT_FN_ATTRS 3238 _mm256_loadu_si256(__m256i const *__p) 3239 { 3240 struct __loadu_si256 { 3241 __m256i __v; 3242 } __attribute__((__packed__, __may_alias__)); 3243 return ((struct __loadu_si256*)__p)->__v; 3244 } 3245 3246 /// \brief Loads 256 bits of integer data from an unaligned memory location 3247 /// pointed to by \a __p into a 256-bit integer vector. This intrinsic may 3248 /// perform better than \c _mm256_loadu_si256 when the data crosses a cache 3249 /// line boundary. 3250 /// 3251 /// \headerfile <x86intrin.h> 3252 /// 3253 /// This intrinsic corresponds to the <c> VLDDQU </c> instruction. 3254 /// 3255 /// \param __p 3256 /// A pointer to a 256-bit integer vector containing integer values. 3257 /// \returns A 256-bit integer vector containing the moved values. 3258 static __inline __m256i __DEFAULT_FN_ATTRS 3259 _mm256_lddqu_si256(__m256i const *__p) 3260 { 3261 return (__m256i)__builtin_ia32_lddqu256((char const *)__p); 3262 } 3263 3264 /* SIMD store ops */ 3265 /// \brief Stores double-precision floating point values from a 256-bit vector 3266 /// of [4 x double] to a 32-byte aligned memory location pointed to by 3267 /// \a __p. 3268 /// 3269 /// \headerfile <x86intrin.h> 3270 /// 3271 /// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. 3272 /// 3273 /// \param __p 3274 /// A 32-byte aligned pointer to a memory location that will receive the 3275 /// double-precision floaing point values. 3276 /// \param __a 3277 /// A 256-bit vector of [4 x double] containing the values to be moved. 3278 static __inline void __DEFAULT_FN_ATTRS 3279 _mm256_store_pd(double *__p, __m256d __a) 3280 { 3281 *(__m256d *)__p = __a; 3282 } 3283 3284 /// \brief Stores single-precision floating point values from a 256-bit vector 3285 /// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p. 
3286 /// 3287 /// \headerfile <x86intrin.h> 3288 /// 3289 /// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. 3290 /// 3291 /// \param __p 3292 /// A 32-byte aligned pointer to a memory location that will receive the 3293 /// float values. 3294 /// \param __a 3295 /// A 256-bit vector of [8 x float] containing the values to be moved. 3296 static __inline void __DEFAULT_FN_ATTRS 3297 _mm256_store_ps(float *__p, __m256 __a) 3298 { 3299 *(__m256 *)__p = __a; 3300 } 3301 3302 /// \brief Stores double-precision floating point values from a 256-bit vector 3303 /// of [4 x double] to an unaligned memory location pointed to by \a __p. 3304 /// 3305 /// \headerfile <x86intrin.h> 3306 /// 3307 /// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. 3308 /// 3309 /// \param __p 3310 /// A pointer to a memory location that will receive the double-precision 3311 /// floating point values. 3312 /// \param __a 3313 /// A 256-bit vector of [4 x double] containing the values to be moved. 3314 static __inline void __DEFAULT_FN_ATTRS 3315 _mm256_storeu_pd(double *__p, __m256d __a) 3316 { 3317 struct __storeu_pd { 3318 __m256d __v; 3319 } __attribute__((__packed__, __may_alias__)); 3320 ((struct __storeu_pd*)__p)->__v = __a; 3321 } 3322 3323 /// \brief Stores single-precision floating point values from a 256-bit vector 3324 /// of [8 x float] to an unaligned memory location pointed to by \a __p. 3325 /// 3326 /// \headerfile <x86intrin.h> 3327 /// 3328 /// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. 3329 /// 3330 /// \param __p 3331 /// A pointer to a memory location that will receive the float values. 3332 /// \param __a 3333 /// A 256-bit vector of [8 x float] containing the values to be moved. 
3334 static __inline void __DEFAULT_FN_ATTRS 3335 _mm256_storeu_ps(float *__p, __m256 __a) 3336 { 3337 struct __storeu_ps { 3338 __m256 __v; 3339 } __attribute__((__packed__, __may_alias__)); 3340 ((struct __storeu_ps*)__p)->__v = __a; 3341 } 3342 3343 /// \brief Stores integer values from a 256-bit integer vector to a 32-byte 3344 /// aligned memory location pointed to by \a __p. 3345 /// 3346 /// \headerfile <x86intrin.h> 3347 /// 3348 /// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. 3349 /// 3350 /// \param __p 3351 /// A 32-byte aligned pointer to a memory location that will receive the 3352 /// integer values. 3353 /// \param __a 3354 /// A 256-bit integer vector containing the values to be moved. 3355 static __inline void __DEFAULT_FN_ATTRS 3356 _mm256_store_si256(__m256i *__p, __m256i __a) 3357 { 3358 *__p = __a; 3359 } 3360 3361 /// \brief Stores integer values from a 256-bit integer vector to an unaligned 3362 /// memory location pointed to by \a __p. 3363 /// 3364 /// \headerfile <x86intrin.h> 3365 /// 3366 /// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. 3367 /// 3368 /// \param __p 3369 /// A pointer to a memory location that will receive the integer values. 3370 /// \param __a 3371 /// A 256-bit integer vector containing the values to be moved. 3372 static __inline void __DEFAULT_FN_ATTRS 3373 _mm256_storeu_si256(__m256i *__p, __m256i __a) 3374 { 3375 struct __storeu_si256 { 3376 __m256i __v; 3377 } __attribute__((__packed__, __may_alias__)); 3378 ((struct __storeu_si256*)__p)->__v = __a; 3379 } 3380 3381 /* Conditional load ops */ 3382 /// \brief Conditionally loads double-precision floating point elements from a 3383 /// memory location pointed to by \a __p into a 128-bit vector of 3384 /// [2 x double], depending on the mask bits associated with each data 3385 /// element. 3386 /// 3387 /// \headerfile <x86intrin.h> 3388 /// 3389 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 
3390 /// 3391 /// \param __p 3392 /// A pointer to a memory location that contains the double-precision 3393 /// floating point values. 3394 /// \param __m 3395 /// A 128-bit integer vector containing the mask. The most significant bit of 3396 /// each data element represents the mask bits. If a mask bit is zero, the 3397 /// corresponding value in the memory location is not loaded and the 3398 /// corresponding field in the return value is set to zero. 3399 /// \returns A 128-bit vector of [2 x double] containing the loaded values. 3400 static __inline __m128d __DEFAULT_FN_ATTRS 3401 _mm_maskload_pd(double const *__p, __m128i __m) 3402 { 3403 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m); 3404 } 3405 3406 /// \brief Conditionally loads double-precision floating point elements from a 3407 /// memory location pointed to by \a __p into a 256-bit vector of 3408 /// [4 x double], depending on the mask bits associated with each data 3409 /// element. 3410 /// 3411 /// \headerfile <x86intrin.h> 3412 /// 3413 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3414 /// 3415 /// \param __p 3416 /// A pointer to a memory location that contains the double-precision 3417 /// floating point values. 3418 /// \param __m 3419 /// A 256-bit integer vector of [4 x quadword] containing the mask. The most 3420 /// significant bit of each quadword element represents the mask bits. If a 3421 /// mask bit is zero, the corresponding value in the memory location is not 3422 /// loaded and the corresponding field in the return value is set to zero. 3423 /// \returns A 256-bit vector of [4 x double] containing the loaded values. 
3424 static __inline __m256d __DEFAULT_FN_ATTRS 3425 _mm256_maskload_pd(double const *__p, __m256i __m) 3426 { 3427 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p, 3428 (__v4di)__m); 3429 } 3430 3431 /// \brief Conditionally loads single-precision floating point elements from a 3432 /// memory location pointed to by \a __p into a 128-bit vector of 3433 /// [4 x float], depending on the mask bits associated with each data 3434 /// element. 3435 /// 3436 /// \headerfile <x86intrin.h> 3437 /// 3438 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3439 /// 3440 /// \param __p 3441 /// A pointer to a memory location that contains the single-precision 3442 /// floating point values. 3443 /// \param __m 3444 /// A 128-bit integer vector containing the mask. The most significant bit of 3445 /// each data element represents the mask bits. If a mask bit is zero, the 3446 /// corresponding value in the memory location is not loaded and the 3447 /// corresponding field in the return value is set to zero. 3448 /// \returns A 128-bit vector of [4 x float] containing the loaded values. 3449 static __inline __m128 __DEFAULT_FN_ATTRS 3450 _mm_maskload_ps(float const *__p, __m128i __m) 3451 { 3452 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m); 3453 } 3454 3455 /// \brief Conditionally loads single-precision floating point elements from a 3456 /// memory location pointed to by \a __p into a 256-bit vector of 3457 /// [8 x float], depending on the mask bits associated with each data 3458 /// element. 3459 /// 3460 /// \headerfile <x86intrin.h> 3461 /// 3462 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3463 /// 3464 /// \param __p 3465 /// A pointer to a memory location that contains the single-precision 3466 /// floating point values. 3467 /// \param __m 3468 /// A 256-bit integer vector of [8 x dword] containing the mask. 
The most 3469 /// significant bit of each dword element represents the mask bits. If a mask 3470 /// bit is zero, the corresponding value in the memory location is not loaded 3471 /// and the corresponding field in the return value is set to zero. 3472 /// \returns A 256-bit vector of [8 x float] containing the loaded values. 3473 static __inline __m256 __DEFAULT_FN_ATTRS 3474 _mm256_maskload_ps(float const *__p, __m256i __m) 3475 { 3476 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m); 3477 } 3478 3479 /* Conditional store ops */ 3480 /// \brief Moves single-precision floating point values from a 256-bit vector 3481 /// of [8 x float] to a memory location pointed to by \a __p, according to 3482 /// the specified mask. 3483 /// 3484 /// \headerfile <x86intrin.h> 3485 /// 3486 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3487 /// 3488 /// \param __p 3489 /// A pointer to a memory location that will receive the float values. 3490 /// \param __m 3491 /// A 256-bit integer vector of [8 x dword] containing the mask. The most 3492 /// significant bit of each dword element in the mask vector represents the 3493 /// mask bits. If a mask bit is zero, the corresponding value from vector 3494 /// \a __a is not stored and the corresponding field in the memory location 3495 /// pointed to by \a __p is not changed. 3496 /// \param __a 3497 /// A 256-bit vector of [8 x float] containing the values to be stored. 3498 static __inline void __DEFAULT_FN_ATTRS 3499 _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a) 3500 { 3501 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a); 3502 } 3503 3504 /// \brief Moves double-precision values from a 128-bit vector of [2 x double] 3505 /// to a memory location pointed to by \a __p, according to the specified 3506 /// mask. 3507 /// 3508 /// \headerfile <x86intrin.h> 3509 /// 3510 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 
3511 /// 3512 /// \param __p 3513 /// A pointer to a memory location that will receive the float values. 3514 /// \param __m 3515 /// A 128-bit integer vector containing the mask. The most significant bit of 3516 /// each field in the mask vector represents the mask bits. If a mask bit is 3517 /// zero, the corresponding value from vector \a __a is not stored and the 3518 /// corresponding field in the memory location pointed to by \a __p is not 3519 /// changed. 3520 /// \param __a 3521 /// A 128-bit vector of [2 x double] containing the values to be stored. 3522 static __inline void __DEFAULT_FN_ATTRS 3523 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a) 3524 { 3525 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a); 3526 } 3527 3528 /// \brief Moves double-precision values from a 256-bit vector of [4 x double] 3529 /// to a memory location pointed to by \a __p, according to the specified 3530 /// mask. 3531 /// 3532 /// \headerfile <x86intrin.h> 3533 /// 3534 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3535 /// 3536 /// \param __p 3537 /// A pointer to a memory location that will receive the float values. 3538 /// \param __m 3539 /// A 256-bit integer vector of [4 x quadword] containing the mask. The most 3540 /// significant bit of each quadword element in the mask vector represents 3541 /// the mask bits. If a mask bit is zero, the corresponding value from vector 3542 /// __a is not stored and the corresponding field in the memory location 3543 /// pointed to by \a __p is not changed. 3544 /// \param __a 3545 /// A 256-bit vector of [4 x double] containing the values to be stored. 
3546 static __inline void __DEFAULT_FN_ATTRS 3547 _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a) 3548 { 3549 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a); 3550 } 3551 3552 /// \brief Moves single-precision floating point values from a 128-bit vector 3553 /// of [4 x float] to a memory location pointed to by \a __p, according to 3554 /// the specified mask. 3555 /// 3556 /// \headerfile <x86intrin.h> 3557 /// 3558 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3559 /// 3560 /// \param __p 3561 /// A pointer to a memory location that will receive the float values. 3562 /// \param __m 3563 /// A 128-bit integer vector containing the mask. The most significant bit of 3564 /// each field in the mask vector represents the mask bits. If a mask bit is 3565 /// zero, the corresponding value from vector __a is not stored and the 3566 /// corresponding field in the memory location pointed to by \a __p is not 3567 /// changed. 3568 /// \param __a 3569 /// A 128-bit vector of [4 x float] containing the values to be stored. 3570 static __inline void __DEFAULT_FN_ATTRS 3571 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) 3572 { 3573 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a); 3574 } 3575 3576 /* Cacheability support ops */ 3577 /// \brief Moves integer data from a 256-bit integer vector to a 32-byte 3578 /// aligned memory location. To minimize caching, the data is flagged as 3579 /// non-temporal (unlikely to be used again soon). 3580 /// 3581 /// \headerfile <x86intrin.h> 3582 /// 3583 /// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction. 3584 /// 3585 /// \param __a 3586 /// A pointer to a 32-byte aligned memory location that will receive the 3587 /// integer values. 3588 /// \param __b 3589 /// A 256-bit integer vector containing the values to be moved. 
3590 static __inline void __DEFAULT_FN_ATTRS 3591 _mm256_stream_si256(__m256i *__a, __m256i __b) 3592 { 3593 __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a); 3594 } 3595 3596 /// \brief Moves double-precision values from a 256-bit vector of [4 x double] 3597 /// to a 32-byte aligned memory location. To minimize caching, the data is 3598 /// flagged as non-temporal (unlikely to be used again soon). 3599 /// 3600 /// \headerfile <x86intrin.h> 3601 /// 3602 /// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction. 3603 /// 3604 /// \param __a 3605 /// A pointer to a 32-byte aligned memory location that will receive the 3606 /// double-precision floating-point values. 3607 /// \param __b 3608 /// A 256-bit vector of [4 x double] containing the values to be moved. 3609 static __inline void __DEFAULT_FN_ATTRS 3610 _mm256_stream_pd(double *__a, __m256d __b) 3611 { 3612 __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a); 3613 } 3614 3615 /// \brief Moves single-precision floating point values from a 256-bit vector 3616 /// of [8 x float] to a 32-byte aligned memory location. To minimize 3617 /// caching, the data is flagged as non-temporal (unlikely to be used again 3618 /// soon). 3619 /// 3620 /// \headerfile <x86intrin.h> 3621 /// 3622 /// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction. 3623 /// 3624 /// \param __p 3625 /// A pointer to a 32-byte aligned memory location that will receive the 3626 /// single-precision floating point values. 3627 /// \param __a 3628 /// A 256-bit vector of [8 x float] containing the values to be moved. 3629 static __inline void __DEFAULT_FN_ATTRS 3630 _mm256_stream_ps(float *__p, __m256 __a) 3631 { 3632 __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p); 3633 } 3634 3635 /* Create vectors */ 3636 /// \brief Create a 256-bit vector of [4 x double] with undefined values. 3637 /// 3638 /// \headerfile <x86intrin.h> 3639 /// 3640 /// This intrinsic has no corresponding instruction. 
3641 /// 3642 /// \returns A 256-bit vector of [4 x double] containing undefined values. 3643 static __inline__ __m256d __DEFAULT_FN_ATTRS 3644 _mm256_undefined_pd(void) 3645 { 3646 return (__m256d)__builtin_ia32_undef256(); 3647 } 3648 3649 /// \brief Create a 256-bit vector of [8 x float] with undefined values. 3650 /// 3651 /// \headerfile <x86intrin.h> 3652 /// 3653 /// This intrinsic has no corresponding instruction. 3654 /// 3655 /// \returns A 256-bit vector of [8 x float] containing undefined values. 3656 static __inline__ __m256 __DEFAULT_FN_ATTRS 3657 _mm256_undefined_ps(void) 3658 { 3659 return (__m256)__builtin_ia32_undef256(); 3660 } 3661 3662 /// \brief Create a 256-bit integer vector with undefined values. 3663 /// 3664 /// \headerfile <x86intrin.h> 3665 /// 3666 /// This intrinsic has no corresponding instruction. 3667 /// 3668 /// \returns A 256-bit integer vector containing undefined values. 3669 static __inline__ __m256i __DEFAULT_FN_ATTRS 3670 _mm256_undefined_si256(void) 3671 { 3672 return (__m256i)__builtin_ia32_undef256(); 3673 } 3674 3675 /// \brief Constructs a 256-bit floating-point vector of [4 x double] 3676 /// initialized with the specified double-precision floating-point values. 3677 /// 3678 /// \headerfile <x86intrin.h> 3679 /// 3680 /// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> 3681 /// instruction. 3682 /// 3683 /// \param __a 3684 /// A double-precision floating-point value used to initialize bits [255:192] 3685 /// of the result. 3686 /// \param __b 3687 /// A double-precision floating-point value used to initialize bits [191:128] 3688 /// of the result. 3689 /// \param __c 3690 /// A double-precision floating-point value used to initialize bits [127:64] 3691 /// of the result. 3692 /// \param __d 3693 /// A double-precision floating-point value used to initialize bits [63:0] 3694 /// of the result. 3695 /// \returns An initialized 256-bit floating-point vector of [4 x double]. 
3696 static __inline __m256d __DEFAULT_FN_ATTRS 3697 _mm256_set_pd(double __a, double __b, double __c, double __d) 3698 { 3699 return (__m256d){ __d, __c, __b, __a }; 3700 } 3701 3702 /// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized 3703 /// with the specified single-precision floating-point values. 3704 /// 3705 /// \headerfile <x86intrin.h> 3706 /// 3707 /// This intrinsic is a utility function and does not correspond to a specific 3708 /// instruction. 3709 /// 3710 /// \param __a 3711 /// A single-precision floating-point value used to initialize bits [255:224] 3712 /// of the result. 3713 /// \param __b 3714 /// A single-precision floating-point value used to initialize bits [223:192] 3715 /// of the result. 3716 /// \param __c 3717 /// A single-precision floating-point value used to initialize bits [191:160] 3718 /// of the result. 3719 /// \param __d 3720 /// A single-precision floating-point value used to initialize bits [159:128] 3721 /// of the result. 3722 /// \param __e 3723 /// A single-precision floating-point value used to initialize bits [127:96] 3724 /// of the result. 3725 /// \param __f 3726 /// A single-precision floating-point value used to initialize bits [95:64] 3727 /// of the result. 3728 /// \param __g 3729 /// A single-precision floating-point value used to initialize bits [63:32] 3730 /// of the result. 3731 /// \param __h 3732 /// A single-precision floating-point value used to initialize bits [31:0] 3733 /// of the result. 3734 /// \returns An initialized 256-bit floating-point vector of [8 x float]. 3735 static __inline __m256 __DEFAULT_FN_ATTRS 3736 _mm256_set_ps(float __a, float __b, float __c, float __d, 3737 float __e, float __f, float __g, float __h) 3738 { 3739 return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a }; 3740 } 3741 3742 /// \brief Constructs a 256-bit integer vector initialized with the specified 3743 /// 32-bit integral values. 
3744 /// 3745 /// \headerfile <x86intrin.h> 3746 /// 3747 /// This intrinsic is a utility function and does not correspond to a specific 3748 /// instruction. 3749 /// 3750 /// \param __i0 3751 /// A 32-bit integral value used to initialize bits [255:224] of the result. 3752 /// \param __i1 3753 /// A 32-bit integral value used to initialize bits [223:192] of the result. 3754 /// \param __i2 3755 /// A 32-bit integral value used to initialize bits [191:160] of the result. 3756 /// \param __i3 3757 /// A 32-bit integral value used to initialize bits [159:128] of the result. 3758 /// \param __i4 3759 /// A 32-bit integral value used to initialize bits [127:96] of the result. 3760 /// \param __i5 3761 /// A 32-bit integral value used to initialize bits [95:64] of the result. 3762 /// \param __i6 3763 /// A 32-bit integral value used to initialize bits [63:32] of the result. 3764 /// \param __i7 3765 /// A 32-bit integral value used to initialize bits [31:0] of the result. 3766 /// \returns An initialized 256-bit integer vector. 3767 static __inline __m256i __DEFAULT_FN_ATTRS 3768 _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, 3769 int __i4, int __i5, int __i6, int __i7) 3770 { 3771 return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 }; 3772 } 3773 3774 /// \brief Constructs a 256-bit integer vector initialized with the specified 3775 /// 16-bit integral values. 3776 /// 3777 /// \headerfile <x86intrin.h> 3778 /// 3779 /// This intrinsic is a utility function and does not correspond to a specific 3780 /// instruction. 3781 /// 3782 /// \param __w15 3783 /// A 16-bit integral value used to initialize bits [255:240] of the result. 3784 /// \param __w14 3785 /// A 16-bit integral value used to initialize bits [239:224] of the result. 3786 /// \param __w13 3787 /// A 16-bit integral value used to initialize bits [223:208] of the result. 3788 /// \param __w12 3789 /// A 16-bit integral value used to initialize bits [207:192] of the result. 
3790 /// \param __w11 3791 /// A 16-bit integral value used to initialize bits [191:176] of the result. 3792 /// \param __w10 3793 /// A 16-bit integral value used to initialize bits [175:160] of the result. 3794 /// \param __w09 3795 /// A 16-bit integral value used to initialize bits [159:144] of the result. 3796 /// \param __w08 3797 /// A 16-bit integral value used to initialize bits [143:128] of the result. 3798 /// \param __w07 3799 /// A 16-bit integral value used to initialize bits [127:112] of the result. 3800 /// \param __w06 3801 /// A 16-bit integral value used to initialize bits [111:96] of the result. 3802 /// \param __w05 3803 /// A 16-bit integral value used to initialize bits [95:80] of the result. 3804 /// \param __w04 3805 /// A 16-bit integral value used to initialize bits [79:64] of the result. 3806 /// \param __w03 3807 /// A 16-bit integral value used to initialize bits [63:48] of the result. 3808 /// \param __w02 3809 /// A 16-bit integral value used to initialize bits [47:32] of the result. 3810 /// \param __w01 3811 /// A 16-bit integral value used to initialize bits [31:16] of the result. 3812 /// \param __w00 3813 /// A 16-bit integral value used to initialize bits [15:0] of the result. 3814 /// \returns An initialized 256-bit integer vector. 3815 static __inline __m256i __DEFAULT_FN_ATTRS 3816 _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, 3817 short __w11, short __w10, short __w09, short __w08, 3818 short __w07, short __w06, short __w05, short __w04, 3819 short __w03, short __w02, short __w01, short __w00) 3820 { 3821 return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06, 3822 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 }; 3823 } 3824 3825 /// \brief Constructs a 256-bit integer vector initialized with the specified 3826 /// 8-bit integral values. 
3827 /// 3828 /// \headerfile <x86intrin.h> 3829 /// 3830 /// This intrinsic is a utility function and does not correspond to a specific 3831 /// instruction. 3832 /// 3833 /// \param __b31 3834 /// An 8-bit integral value used to initialize bits [255:248] of the result. 3835 /// \param __b30 3836 /// An 8-bit integral value used to initialize bits [247:240] of the result. 3837 /// \param __b29 3838 /// An 8-bit integral value used to initialize bits [239:232] of the result. 3839 /// \param __b28 3840 /// An 8-bit integral value used to initialize bits [231:224] of the result. 3841 /// \param __b27 3842 /// An 8-bit integral value used to initialize bits [223:216] of the result. 3843 /// \param __b26 3844 /// An 8-bit integral value used to initialize bits [215:208] of the result. 3845 /// \param __b25 3846 /// An 8-bit integral value used to initialize bits [207:200] of the result. 3847 /// \param __b24 3848 /// An 8-bit integral value used to initialize bits [199:192] of the result. 3849 /// \param __b23 3850 /// An 8-bit integral value used to initialize bits [191:184] of the result. 3851 /// \param __b22 3852 /// An 8-bit integral value used to initialize bits [183:176] of the result. 3853 /// \param __b21 3854 /// An 8-bit integral value used to initialize bits [175:168] of the result. 3855 /// \param __b20 3856 /// An 8-bit integral value used to initialize bits [167:160] of the result. 3857 /// \param __b19 3858 /// An 8-bit integral value used to initialize bits [159:152] of the result. 3859 /// \param __b18 3860 /// An 8-bit integral value used to initialize bits [151:144] of the result. 3861 /// \param __b17 3862 /// An 8-bit integral value used to initialize bits [143:136] of the result. 3863 /// \param __b16 3864 /// An 8-bit integral value used to initialize bits [135:128] of the result. 3865 /// \param __b15 3866 /// An 8-bit integral value used to initialize bits [127:120] of the result. 
3867 /// \param __b14 3868 /// An 8-bit integral value used to initialize bits [119:112] of the result. 3869 /// \param __b13 3870 /// An 8-bit integral value used to initialize bits [111:104] of the result. 3871 /// \param __b12 3872 /// An 8-bit integral value used to initialize bits [103:96] of the result. 3873 /// \param __b11 3874 /// An 8-bit integral value used to initialize bits [95:88] of the result. 3875 /// \param __b10 3876 /// An 8-bit integral value used to initialize bits [87:80] of the result. 3877 /// \param __b09 3878 /// An 8-bit integral value used to initialize bits [79:72] of the result. 3879 /// \param __b08 3880 /// An 8-bit integral value used to initialize bits [71:64] of the result. 3881 /// \param __b07 3882 /// An 8-bit integral value used to initialize bits [63:56] of the result. 3883 /// \param __b06 3884 /// An 8-bit integral value used to initialize bits [55:48] of the result. 3885 /// \param __b05 3886 /// An 8-bit integral value used to initialize bits [47:40] of the result. 3887 /// \param __b04 3888 /// An 8-bit integral value used to initialize bits [39:32] of the result. 3889 /// \param __b03 3890 /// An 8-bit integral value used to initialize bits [31:24] of the result. 3891 /// \param __b02 3892 /// An 8-bit integral value used to initialize bits [23:16] of the result. 3893 /// \param __b01 3894 /// An 8-bit integral value used to initialize bits [15:8] of the result. 3895 /// \param __b00 3896 /// An 8-bit integral value used to initialize bits [7:0] of the result. 3897 /// \returns An initialized 256-bit integer vector. 
3898 static __inline __m256i __DEFAULT_FN_ATTRS 3899 _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, 3900 char __b27, char __b26, char __b25, char __b24, 3901 char __b23, char __b22, char __b21, char __b20, 3902 char __b19, char __b18, char __b17, char __b16, 3903 char __b15, char __b14, char __b13, char __b12, 3904 char __b11, char __b10, char __b09, char __b08, 3905 char __b07, char __b06, char __b05, char __b04, 3906 char __b03, char __b02, char __b01, char __b00) 3907 { 3908 return (__m256i)(__v32qi){ 3909 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07, 3910 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15, 3911 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23, 3912 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31 3913 }; 3914 } 3915 3916 /// \brief Constructs a 256-bit integer vector initialized with the specified 3917 /// 64-bit integral values. 3918 /// 3919 /// \headerfile <x86intrin.h> 3920 /// 3921 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c> 3922 /// instruction. 3923 /// 3924 /// \param __a 3925 /// A 64-bit integral value used to initialize bits [255:192] of the result. 3926 /// \param __b 3927 /// A 64-bit integral value used to initialize bits [191:128] of the result. 3928 /// \param __c 3929 /// A 64-bit integral value used to initialize bits [127:64] of the result. 3930 /// \param __d 3931 /// A 64-bit integral value used to initialize bits [63:0] of the result. 3932 /// \returns An initialized 256-bit integer vector. 3933 static __inline __m256i __DEFAULT_FN_ATTRS 3934 _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) 3935 { 3936 return (__m256i)(__v4di){ __d, __c, __b, __a }; 3937 } 3938 3939 /* Create vectors with elements in reverse order */ 3940 /// \brief Constructs a 256-bit floating-point vector of [4 x double], 3941 /// initialized in reverse order with the specified double-precision 3942 /// floating-point values. 
3943 /// 3944 /// \headerfile <x86intrin.h> 3945 /// 3946 /// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> 3947 /// instruction. 3948 /// 3949 /// \param __a 3950 /// A double-precision floating-point value used to initialize bits [63:0] 3951 /// of the result. 3952 /// \param __b 3953 /// A double-precision floating-point value used to initialize bits [127:64] 3954 /// of the result. 3955 /// \param __c 3956 /// A double-precision floating-point value used to initialize bits [191:128] 3957 /// of the result. 3958 /// \param __d 3959 /// A double-precision floating-point value used to initialize bits [255:192] 3960 /// of the result. 3961 /// \returns An initialized 256-bit floating-point vector of [4 x double]. 3962 static __inline __m256d __DEFAULT_FN_ATTRS 3963 _mm256_setr_pd(double __a, double __b, double __c, double __d) 3964 { 3965 return (__m256d){ __a, __b, __c, __d }; 3966 } 3967 3968 /// \brief Constructs a 256-bit floating-point vector of [8 x float], 3969 /// initialized in reverse order with the specified single-precision 3970 /// float-point values. 3971 /// 3972 /// \headerfile <x86intrin.h> 3973 /// 3974 /// This intrinsic is a utility function and does not correspond to a specific 3975 /// instruction. 3976 /// 3977 /// \param __a 3978 /// A single-precision floating-point value used to initialize bits [31:0] 3979 /// of the result. 3980 /// \param __b 3981 /// A single-precision floating-point value used to initialize bits [63:32] 3982 /// of the result. 3983 /// \param __c 3984 /// A single-precision floating-point value used to initialize bits [95:64] 3985 /// of the result. 3986 /// \param __d 3987 /// A single-precision floating-point value used to initialize bits [127:96] 3988 /// of the result. 3989 /// \param __e 3990 /// A single-precision floating-point value used to initialize bits [159:128] 3991 /// of the result. 
3992 /// \param __f 3993 /// A single-precision floating-point value used to initialize bits [191:160] 3994 /// of the result. 3995 /// \param __g 3996 /// A single-precision floating-point value used to initialize bits [223:192] 3997 /// of the result. 3998 /// \param __h 3999 /// A single-precision floating-point value used to initialize bits [255:224] 4000 /// of the result. 4001 /// \returns An initialized 256-bit floating-point vector of [8 x float]. 4002 static __inline __m256 __DEFAULT_FN_ATTRS 4003 _mm256_setr_ps(float __a, float __b, float __c, float __d, 4004 float __e, float __f, float __g, float __h) 4005 { 4006 return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h }; 4007 } 4008 4009 /// \brief Constructs a 256-bit integer vector, initialized in reverse order 4010 /// with the specified 32-bit integral values. 4011 /// 4012 /// \headerfile <x86intrin.h> 4013 /// 4014 /// This intrinsic is a utility function and does not correspond to a specific 4015 /// instruction. 4016 /// 4017 /// \param __i0 4018 /// A 32-bit integral value used to initialize bits [31:0] of the result. 4019 /// \param __i1 4020 /// A 32-bit integral value used to initialize bits [63:32] of the result. 4021 /// \param __i2 4022 /// A 32-bit integral value used to initialize bits [95:64] of the result. 4023 /// \param __i3 4024 /// A 32-bit integral value used to initialize bits [127:96] of the result. 4025 /// \param __i4 4026 /// A 32-bit integral value used to initialize bits [159:128] of the result. 4027 /// \param __i5 4028 /// A 32-bit integral value used to initialize bits [191:160] of the result. 4029 /// \param __i6 4030 /// A 32-bit integral value used to initialize bits [223:192] of the result. 4031 /// \param __i7 4032 /// A 32-bit integral value used to initialize bits [255:224] of the result. 4033 /// \returns An initialized 256-bit integer vector. 
4034 static __inline __m256i __DEFAULT_FN_ATTRS 4035 _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, 4036 int __i4, int __i5, int __i6, int __i7) 4037 { 4038 return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 }; 4039 } 4040 4041 /// \brief Constructs a 256-bit integer vector, initialized in reverse order 4042 /// with the specified 16-bit integral values. 4043 /// 4044 /// \headerfile <x86intrin.h> 4045 /// 4046 /// This intrinsic is a utility function and does not correspond to a specific 4047 /// instruction. 4048 /// 4049 /// \param __w15 4050 /// A 16-bit integral value used to initialize bits [15:0] of the result. 4051 /// \param __w14 4052 /// A 16-bit integral value used to initialize bits [31:16] of the result. 4053 /// \param __w13 4054 /// A 16-bit integral value used to initialize bits [47:32] of the result. 4055 /// \param __w12 4056 /// A 16-bit integral value used to initialize bits [63:48] of the result. 4057 /// \param __w11 4058 /// A 16-bit integral value used to initialize bits [79:64] of the result. 4059 /// \param __w10 4060 /// A 16-bit integral value used to initialize bits [95:80] of the result. 4061 /// \param __w09 4062 /// A 16-bit integral value used to initialize bits [111:96] of the result. 4063 /// \param __w08 4064 /// A 16-bit integral value used to initialize bits [127:112] of the result. 4065 /// \param __w07 4066 /// A 16-bit integral value used to initialize bits [143:128] of the result. 4067 /// \param __w06 4068 /// A 16-bit integral value used to initialize bits [159:144] of the result. 4069 /// \param __w05 4070 /// A 16-bit integral value used to initialize bits [175:160] of the result. 4071 /// \param __w04 4072 /// A 16-bit integral value used to initialize bits [191:176] of the result. 4073 /// \param __w03 4074 /// A 16-bit integral value used to initialize bits [207:192] of the result. 4075 /// \param __w02 4076 /// A 16-bit integral value used to initialize bits [223:208] of the result. 
4077 /// \param __w01 4078 /// A 16-bit integral value used to initialize bits [239:224] of the result. 4079 /// \param __w00 4080 /// A 16-bit integral value used to initialize bits [255:240] of the result. 4081 /// \returns An initialized 256-bit integer vector. 4082 static __inline __m256i __DEFAULT_FN_ATTRS 4083 _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, 4084 short __w11, short __w10, short __w09, short __w08, 4085 short __w07, short __w06, short __w05, short __w04, 4086 short __w03, short __w02, short __w01, short __w00) 4087 { 4088 return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09, 4089 __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 }; 4090 } 4091 4092 /// \brief Constructs a 256-bit integer vector, initialized in reverse order 4093 /// with the specified 8-bit integral values. 4094 /// 4095 /// \headerfile <x86intrin.h> 4096 /// 4097 /// This intrinsic is a utility function and does not correspond to a specific 4098 /// instruction. 4099 /// 4100 /// \param __b31 4101 /// An 8-bit integral value used to initialize bits [7:0] of the result. 4102 /// \param __b30 4103 /// An 8-bit integral value used to initialize bits [15:8] of the result. 4104 /// \param __b29 4105 /// An 8-bit integral value used to initialize bits [23:16] of the result. 4106 /// \param __b28 4107 /// An 8-bit integral value used to initialize bits [31:24] of the result. 4108 /// \param __b27 4109 /// An 8-bit integral value used to initialize bits [39:32] of the result. 4110 /// \param __b26 4111 /// An 8-bit integral value used to initialize bits [47:40] of the result. 4112 /// \param __b25 4113 /// An 8-bit integral value used to initialize bits [55:48] of the result. 4114 /// \param __b24 4115 /// An 8-bit integral value used to initialize bits [63:56] of the result. 4116 /// \param __b23 4117 /// An 8-bit integral value used to initialize bits [71:64] of the result. 
4118 /// \param __b22 4119 /// An 8-bit integral value used to initialize bits [79:72] of the result. 4120 /// \param __b21 4121 /// An 8-bit integral value used to initialize bits [87:80] of the result. 4122 /// \param __b20 4123 /// An 8-bit integral value used to initialize bits [95:88] of the result. 4124 /// \param __b19 4125 /// An 8-bit integral value used to initialize bits [103:96] of the result. 4126 /// \param __b18 4127 /// An 8-bit integral value used to initialize bits [111:104] of the result. 4128 /// \param __b17 4129 /// An 8-bit integral value used to initialize bits [119:112] of the result. 4130 /// \param __b16 4131 /// An 8-bit integral value used to initialize bits [127:120] of the result. 4132 /// \param __b15 4133 /// An 8-bit integral value used to initialize bits [135:128] of the result. 4134 /// \param __b14 4135 /// An 8-bit integral value used to initialize bits [143:136] of the result. 4136 /// \param __b13 4137 /// An 8-bit integral value used to initialize bits [151:144] of the result. 4138 /// \param __b12 4139 /// An 8-bit integral value used to initialize bits [159:152] of the result. 4140 /// \param __b11 4141 /// An 8-bit integral value used to initialize bits [167:160] of the result. 4142 /// \param __b10 4143 /// An 8-bit integral value used to initialize bits [175:168] of the result. 4144 /// \param __b09 4145 /// An 8-bit integral value used to initialize bits [183:176] of the result. 4146 /// \param __b08 4147 /// An 8-bit integral value used to initialize bits [191:184] of the result. 4148 /// \param __b07 4149 /// An 8-bit integral value used to initialize bits [199:192] of the result. 4150 /// \param __b06 4151 /// An 8-bit integral value used to initialize bits [207:200] of the result. 4152 /// \param __b05 4153 /// An 8-bit integral value used to initialize bits [215:208] of the result. 4154 /// \param __b04 4155 /// An 8-bit integral value used to initialize bits [223:216] of the result. 
4156 /// \param __b03 4157 /// An 8-bit integral value used to initialize bits [231:224] of the result. 4158 /// \param __b02 4159 /// An 8-bit integral value used to initialize bits [239:232] of the result. 4160 /// \param __b01 4161 /// An 8-bit integral value used to initialize bits [247:240] of the result. 4162 /// \param __b00 4163 /// An 8-bit integral value used to initialize bits [255:248] of the result. 4164 /// \returns An initialized 256-bit integer vector. 4165 static __inline __m256i __DEFAULT_FN_ATTRS 4166 _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, 4167 char __b27, char __b26, char __b25, char __b24, 4168 char __b23, char __b22, char __b21, char __b20, 4169 char __b19, char __b18, char __b17, char __b16, 4170 char __b15, char __b14, char __b13, char __b12, 4171 char __b11, char __b10, char __b09, char __b08, 4172 char __b07, char __b06, char __b05, char __b04, 4173 char __b03, char __b02, char __b01, char __b00) 4174 { 4175 return (__m256i)(__v32qi){ 4176 __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24, 4177 __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16, 4178 __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08, 4179 __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 }; 4180 } 4181 4182 /// \brief Constructs a 256-bit integer vector, initialized in reverse order 4183 /// with the specified 64-bit integral values. 4184 /// 4185 /// \headerfile <x86intrin.h> 4186 /// 4187 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c> 4188 /// instruction. 4189 /// 4190 /// \param __a 4191 /// A 64-bit integral value used to initialize bits [63:0] of the result. 4192 /// \param __b 4193 /// A 64-bit integral value used to initialize bits [127:64] of the result. 4194 /// \param __c 4195 /// A 64-bit integral value used to initialize bits [191:128] of the result. 4196 /// \param __d 4197 /// A 64-bit integral value used to initialize bits [255:192] of the result. 
4198 /// \returns An initialized 256-bit integer vector. 4199 static __inline __m256i __DEFAULT_FN_ATTRS 4200 _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) 4201 { 4202 return (__m256i)(__v4di){ __a, __b, __c, __d }; 4203 } 4204 4205 /* Create vectors with repeated elements */ 4206 /// \brief Constructs a 256-bit floating-point vector of [4 x double], with each 4207 /// of the four double-precision floating-point vector elements set to the 4208 /// specified double-precision floating-point value. 4209 /// 4210 /// \headerfile <x86intrin.h> 4211 /// 4212 /// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. 4213 /// 4214 /// \param __w 4215 /// A double-precision floating-point value used to initialize each vector 4216 /// element of the result. 4217 /// \returns An initialized 256-bit floating-point vector of [4 x double]. 4218 static __inline __m256d __DEFAULT_FN_ATTRS 4219 _mm256_set1_pd(double __w) 4220 { 4221 return (__m256d){ __w, __w, __w, __w }; 4222 } 4223 4224 /// \brief Constructs a 256-bit floating-point vector of [8 x float], with each 4225 /// of the eight single-precision floating-point vector elements set to the 4226 /// specified single-precision floating-point value. 4227 /// 4228 /// \headerfile <x86intrin.h> 4229 /// 4230 /// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> 4231 /// instruction. 4232 /// 4233 /// \param __w 4234 /// A single-precision floating-point value used to initialize each vector 4235 /// element of the result. 4236 /// \returns An initialized 256-bit floating-point vector of [8 x float]. 4237 static __inline __m256 __DEFAULT_FN_ATTRS 4238 _mm256_set1_ps(float __w) 4239 { 4240 return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w }; 4241 } 4242 4243 /// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the 4244 /// 32-bit integral vector elements set to the specified 32-bit integral 4245 /// value. 
4246 /// 4247 /// \headerfile <x86intrin.h> 4248 /// 4249 /// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> 4250 /// instruction. 4251 /// 4252 /// \param __i 4253 /// A 32-bit integral value used to initialize each vector element of the 4254 /// result. 4255 /// \returns An initialized 256-bit integer vector of [8 x i32]. 4256 static __inline __m256i __DEFAULT_FN_ATTRS 4257 _mm256_set1_epi32(int __i) 4258 { 4259 return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i }; 4260 } 4261 4262 /// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the 4263 /// 16-bit integral vector elements set to the specified 16-bit integral 4264 /// value. 4265 /// 4266 /// \headerfile <x86intrin.h> 4267 /// 4268 /// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. 4269 /// 4270 /// \param __w 4271 /// A 16-bit integral value used to initialize each vector element of the 4272 /// result. 4273 /// \returns An initialized 256-bit integer vector of [16 x i16]. 4274 static __inline __m256i __DEFAULT_FN_ATTRS 4275 _mm256_set1_epi16(short __w) 4276 { 4277 return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w, 4278 __w, __w, __w, __w, __w, __w }; 4279 } 4280 4281 /// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the 4282 /// 8-bit integral vector elements set to the specified 8-bit integral value. 4283 /// 4284 /// \headerfile <x86intrin.h> 4285 /// 4286 /// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. 4287 /// 4288 /// \param __b 4289 /// An 8-bit integral value used to initialize each vector element of the 4290 /// result. 4291 /// \returns An initialized 256-bit integer vector of [32 x i8]. 
4292 static __inline __m256i __DEFAULT_FN_ATTRS 4293 _mm256_set1_epi8(char __b) 4294 { 4295 return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, 4296 __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, 4297 __b, __b, __b, __b, __b, __b, __b }; 4298 } 4299 4300 /// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the 4301 /// 64-bit integral vector elements set to the specified 64-bit integral 4302 /// value. 4303 /// 4304 /// \headerfile <x86intrin.h> 4305 /// 4306 /// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. 4307 /// 4308 /// \param __q 4309 /// A 64-bit integral value used to initialize each vector element of the 4310 /// result. 4311 /// \returns An initialized 256-bit integer vector of [4 x i64]. 4312 static __inline __m256i __DEFAULT_FN_ATTRS 4313 _mm256_set1_epi64x(long long __q) 4314 { 4315 return (__m256i)(__v4di){ __q, __q, __q, __q }; 4316 } 4317 4318 /* Create __zeroed vectors */ 4319 /// \brief Constructs a 256-bit floating-point vector of [4 x double] with all 4320 /// vector elements initialized to zero. 4321 /// 4322 /// \headerfile <x86intrin.h> 4323 /// 4324 /// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4325 /// 4326 /// \returns A 256-bit vector of [4 x double] with all elements set to zero. 4327 static __inline __m256d __DEFAULT_FN_ATTRS 4328 _mm256_setzero_pd(void) 4329 { 4330 return (__m256d){ 0, 0, 0, 0 }; 4331 } 4332 4333 /// \brief Constructs a 256-bit floating-point vector of [8 x float] with all 4334 /// vector elements initialized to zero. 4335 /// 4336 /// \headerfile <x86intrin.h> 4337 /// 4338 /// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4339 /// 4340 /// \returns A 256-bit vector of [8 x float] with all elements set to zero. 
4341 static __inline __m256 __DEFAULT_FN_ATTRS 4342 _mm256_setzero_ps(void) 4343 { 4344 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 }; 4345 } 4346 4347 /// \brief Constructs a 256-bit integer vector initialized to zero. 4348 /// 4349 /// \headerfile <x86intrin.h> 4350 /// 4351 /// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4352 /// 4353 /// \returns A 256-bit integer vector initialized to zero. 4354 static __inline __m256i __DEFAULT_FN_ATTRS 4355 _mm256_setzero_si256(void) 4356 { 4357 return (__m256i){ 0LL, 0LL, 0LL, 0LL }; 4358 } 4359 4360 /* Cast between vector types */ 4361 /// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit 4362 /// floating-point vector of [8 x float]. 4363 /// 4364 /// \headerfile <x86intrin.h> 4365 /// 4366 /// This intrinsic has no corresponding instruction. 4367 /// 4368 /// \param __a 4369 /// A 256-bit floating-point vector of [4 x double]. 4370 /// \returns A 256-bit floating-point vector of [8 x float] containing the same 4371 /// bitwise pattern as the parameter. 4372 static __inline __m256 __DEFAULT_FN_ATTRS 4373 _mm256_castpd_ps(__m256d __a) 4374 { 4375 return (__m256)__a; 4376 } 4377 4378 /// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit 4379 /// integer vector. 4380 /// 4381 /// \headerfile <x86intrin.h> 4382 /// 4383 /// This intrinsic has no corresponding instruction. 4384 /// 4385 /// \param __a 4386 /// A 256-bit floating-point vector of [4 x double]. 4387 /// \returns A 256-bit integer vector containing the same bitwise pattern as the 4388 /// parameter. 4389 static __inline __m256i __DEFAULT_FN_ATTRS 4390 _mm256_castpd_si256(__m256d __a) 4391 { 4392 return (__m256i)__a; 4393 } 4394 4395 /// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit 4396 /// floating-point vector of [4 x double]. 4397 /// 4398 /// \headerfile <x86intrin.h> 4399 /// 4400 /// This intrinsic has no corresponding instruction. 
4401 /// 4402 /// \param __a 4403 /// A 256-bit floating-point vector of [8 x float]. 4404 /// \returns A 256-bit floating-point vector of [4 x double] containing the same 4405 /// bitwise pattern as the parameter. 4406 static __inline __m256d __DEFAULT_FN_ATTRS 4407 _mm256_castps_pd(__m256 __a) 4408 { 4409 return (__m256d)__a; 4410 } 4411 4412 /// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit 4413 /// integer vector. 4414 /// 4415 /// \headerfile <x86intrin.h> 4416 /// 4417 /// This intrinsic has no corresponding instruction. 4418 /// 4419 /// \param __a 4420 /// A 256-bit floating-point vector of [8 x float]. 4421 /// \returns A 256-bit integer vector containing the same bitwise pattern as the 4422 /// parameter. 4423 static __inline __m256i __DEFAULT_FN_ATTRS 4424 _mm256_castps_si256(__m256 __a) 4425 { 4426 return (__m256i)__a; 4427 } 4428 4429 /// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector 4430 /// of [8 x float]. 4431 /// 4432 /// \headerfile <x86intrin.h> 4433 /// 4434 /// This intrinsic has no corresponding instruction. 4435 /// 4436 /// \param __a 4437 /// A 256-bit integer vector. 4438 /// \returns A 256-bit floating-point vector of [8 x float] containing the same 4439 /// bitwise pattern as the parameter. 4440 static __inline __m256 __DEFAULT_FN_ATTRS 4441 _mm256_castsi256_ps(__m256i __a) 4442 { 4443 return (__m256)__a; 4444 } 4445 4446 /// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector 4447 /// of [4 x double]. 4448 /// 4449 /// \headerfile <x86intrin.h> 4450 /// 4451 /// This intrinsic has no corresponding instruction. 4452 /// 4453 /// \param __a 4454 /// A 256-bit integer vector. 4455 /// \returns A 256-bit floating-point vector of [4 x double] containing the same 4456 /// bitwise pattern as the parameter. 
4457 static __inline __m256d __DEFAULT_FN_ATTRS 4458 _mm256_castsi256_pd(__m256i __a) 4459 { 4460 return (__m256d)__a; 4461 } 4462 4463 /// \brief Returns the lower 128 bits of a 256-bit floating-point vector of 4464 /// [4 x double] as a 128-bit floating-point vector of [2 x double]. 4465 /// 4466 /// \headerfile <x86intrin.h> 4467 /// 4468 /// This intrinsic has no corresponding instruction. 4469 /// 4470 /// \param __a 4471 /// A 256-bit floating-point vector of [4 x double]. 4472 /// \returns A 128-bit floating-point vector of [2 x double] containing the 4473 /// lower 128 bits of the parameter. 4474 static __inline __m128d __DEFAULT_FN_ATTRS 4475 _mm256_castpd256_pd128(__m256d __a) 4476 { 4477 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1); 4478 } 4479 4480 /// \brief Returns the lower 128 bits of a 256-bit floating-point vector of 4481 /// [8 x float] as a 128-bit floating-point vector of [4 x float]. 4482 /// 4483 /// \headerfile <x86intrin.h> 4484 /// 4485 /// This intrinsic has no corresponding instruction. 4486 /// 4487 /// \param __a 4488 /// A 256-bit floating-point vector of [8 x float]. 4489 /// \returns A 128-bit floating-point vector of [4 x float] containing the 4490 /// lower 128 bits of the parameter. 4491 static __inline __m128 __DEFAULT_FN_ATTRS 4492 _mm256_castps256_ps128(__m256 __a) 4493 { 4494 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3); 4495 } 4496 4497 /// \brief Truncates a 256-bit integer vector into a 128-bit integer vector. 4498 /// 4499 /// \headerfile <x86intrin.h> 4500 /// 4501 /// This intrinsic has no corresponding instruction. 4502 /// 4503 /// \param __a 4504 /// A 256-bit integer vector. 4505 /// \returns A 128-bit integer vector containing the lower 128 bits of the 4506 /// parameter. 
4507 static __inline __m128i __DEFAULT_FN_ATTRS 4508 _mm256_castsi256_si128(__m256i __a) 4509 { 4510 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1); 4511 } 4512 4513 /// \brief Constructs a 256-bit floating-point vector of [4 x double] from a 4514 /// 128-bit floating-point vector of [2 x double]. 4515 /// 4516 /// The lower 128 bits contain the value of the source vector. The contents 4517 /// of the upper 128 bits are undefined. 4518 /// 4519 /// \headerfile <x86intrin.h> 4520 /// 4521 /// This intrinsic has no corresponding instruction. 4522 /// 4523 /// \param __a 4524 /// A 128-bit vector of [2 x double]. 4525 /// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits 4526 /// contain the value of the parameter. The contents of the upper 128 bits 4527 /// are undefined. 4528 static __inline __m256d __DEFAULT_FN_ATTRS 4529 _mm256_castpd128_pd256(__m128d __a) 4530 { 4531 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1); 4532 } 4533 4534 /// \brief Constructs a 256-bit floating-point vector of [8 x float] from a 4535 /// 128-bit floating-point vector of [4 x float]. 4536 /// 4537 /// The lower 128 bits contain the value of the source vector. The contents 4538 /// of the upper 128 bits are undefined. 4539 /// 4540 /// \headerfile <x86intrin.h> 4541 /// 4542 /// This intrinsic has no corresponding instruction. 4543 /// 4544 /// \param __a 4545 /// A 128-bit vector of [4 x float]. 4546 /// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits 4547 /// contain the value of the parameter. The contents of the upper 128 bits 4548 /// are undefined. 4549 static __inline __m256 __DEFAULT_FN_ATTRS 4550 _mm256_castps128_ps256(__m128 __a) 4551 { 4552 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1); 4553 } 4554 4555 /// \brief Constructs a 256-bit integer vector from a 128-bit integer vector. 
4556 /// 4557 /// The lower 128 bits contain the value of the source vector. The contents 4558 /// of the upper 128 bits are undefined. 4559 /// 4560 /// \headerfile <x86intrin.h> 4561 /// 4562 /// This intrinsic has no corresponding instruction. 4563 /// 4564 /// \param __a 4565 /// A 128-bit integer vector. 4566 /// \returns A 256-bit integer vector. The lower 128 bits contain the value of 4567 /// the parameter. The contents of the upper 128 bits are undefined. 4568 static __inline __m256i __DEFAULT_FN_ATTRS 4569 _mm256_castsi128_si256(__m128i __a) 4570 { 4571 return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1); 4572 } 4573 4574 /// \brief Constructs a 256-bit floating-point vector of [4 x double] from a 4575 /// 128-bit floating-point vector of [2 x double]. The lower 128 bits 4576 /// contain the value of the source vector. The upper 128 bits are set 4577 /// to zero. 4578 /// 4579 /// \headerfile <x86intrin.h> 4580 /// 4581 /// This intrinsic has no corresponding instruction. 4582 /// 4583 /// \param __a 4584 /// A 128-bit vector of [2 x double]. 4585 /// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits 4586 /// contain the value of the parameter. The upper 128 bits are set to zero. 4587 static __inline __m256d __DEFAULT_FN_ATTRS 4588 _mm256_zextpd128_pd256(__m128d __a) 4589 { 4590 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3); 4591 } 4592 4593 /// \brief Constructs a 256-bit floating-point vector of [8 x float] from a 4594 /// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain 4595 /// the value of the source vector. The upper 128 bits are set to zero. 4596 /// 4597 /// \headerfile <x86intrin.h> 4598 /// 4599 /// This intrinsic has no corresponding instruction. 4600 /// 4601 /// \param __a 4602 /// A 128-bit vector of [4 x float]. 4603 /// \returns A 256-bit floating-point vector of [8 x float]. 
The lower 128 bits 4604 /// contain the value of the parameter. The upper 128 bits are set to zero. 4605 static __inline __m256 __DEFAULT_FN_ATTRS 4606 _mm256_zextps128_ps256(__m128 __a) 4607 { 4608 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7); 4609 } 4610 4611 /// \brief Constructs a 256-bit integer vector from a 128-bit integer vector. 4612 /// The lower 128 bits contain the value of the source vector. The upper 4613 /// 128 bits are set to zero. 4614 /// 4615 /// \headerfile <x86intrin.h> 4616 /// 4617 /// This intrinsic has no corresponding instruction. 4618 /// 4619 /// \param __a 4620 /// A 128-bit integer vector. 4621 /// \returns A 256-bit integer vector. The lower 128 bits contain the value of 4622 /// the parameter. The upper 128 bits are set to zero. 4623 static __inline __m256i __DEFAULT_FN_ATTRS 4624 _mm256_zextsi128_si256(__m128i __a) 4625 { 4626 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3); 4627 } 4628 4629 /* 4630 Vector insert. 4631 We use macros rather than inlines because we only want to accept 4632 invocations where the immediate M is a constant expression. 4633 */ 4634 /// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating 4635 /// a 256-bit vector of [8 x float] given in the first parameter, and then 4636 /// replacing either the upper or the lower 128 bits with the contents of a 4637 /// 128-bit vector of [4 x float] in the second parameter. 4638 /// 4639 /// The immediate integer parameter determines between the upper or the lower 4640 /// 128 bits. 4641 /// 4642 /// \headerfile <x86intrin.h> 4643 /// 4644 /// \code 4645 /// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M); 4646 /// \endcode 4647 /// 4648 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4649 /// 4650 /// \param V1 4651 /// A 256-bit vector of [8 x float]. 
This vector is copied to the result 4652 /// first, and then either the upper or the lower 128 bits of the result will 4653 /// be replaced by the contents of \a V2. 4654 /// \param V2 4655 /// A 128-bit vector of [4 x float]. The contents of this parameter are 4656 /// written to either the upper or the lower 128 bits of the result depending 4657 /// on the value of parameter \a M. 4658 /// \param M 4659 /// An immediate integer. The least significant bit determines how the values 4660 /// from the two parameters are interleaved: \n 4661 /// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, 4662 /// and bits [255:128] of \a V1 are copied to bits [255:128] of the 4663 /// result. \n 4664 /// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the 4665 /// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the 4666 /// result. 4667 /// \returns A 256-bit vector of [8 x float] containing the interleaved values. 4668 #define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \ 4669 (__m256)__builtin_shufflevector( \ 4670 (__v8sf)(__m256)(V1), \ 4671 (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \ 4672 (((M) & 1) ? 0 : 8), \ 4673 (((M) & 1) ? 1 : 9), \ 4674 (((M) & 1) ? 2 : 10), \ 4675 (((M) & 1) ? 3 : 11), \ 4676 (((M) & 1) ? 8 : 4), \ 4677 (((M) & 1) ? 9 : 5), \ 4678 (((M) & 1) ? 10 : 6), \ 4679 (((M) & 1) ? 11 : 7) );}) 4680 4681 /// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating 4682 /// a 256-bit vector of [4 x double] given in the first parameter, and then 4683 /// replacing either the upper or the lower 128 bits with the contents of a 4684 /// 128-bit vector of [2 x double] in the second parameter. 4685 /// 4686 /// The immediate integer parameter determines between the upper or the lower 4687 /// 128 bits. 
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V1), \
    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
    (((M) & 1) == 0 ? 4 : 0), \
    (((M) & 1) == 0 ? 5 : 1), \
    (((M) & 1) == 0 ? 2 : 4), \
    (((M) & 1) == 0 ? 3 : 5) );})

/// \brief Constructs a new 256-bit integer vector by first duplicating a
///    256-bit integer vector given in the first parameter, and then replacing
///    either the upper or the lower 128 bits with the contents of a 128-bit
///    integer vector in the second parameter.
///
///    The immediate integer parameter determines between the upper or the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. This vector is copied to the result first, and
///    then either the upper or the lower 128 bits of the result will be
///    replaced by the contents of \a V2.
/// \param V2
///    A 128-bit integer vector. The contents of this parameter are written to
///    either the upper or the lower 128 bits of the result depending on the
///    value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V1), \
    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
    (((M) & 1) == 0 ? 4 : 0), \
    (((M) & 1) == 0 ? 5 : 1), \
    (((M) & 1) == 0 ? 2 : 4), \
    (((M) & 1) == 0 ? 3 : 5) );})

/*
   Vector extract.
   We use macros rather than inlines because we only want to accept
   invocations where the immediate M is a constant expression.
*/
/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [8 x float], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
  /* All four indices stay in 0-7, i.e. within the first operand V. */ \
  (__m128)__builtin_shufflevector( \
    (__v8sf)(__m256)(V), \
    (__v8sf)(_mm256_undefined_ps()), \
    ((((M) & 1) == 0) ? 0 : 4), \
    ((((M) & 1) == 0) ? 1 : 5), \
    ((((M) & 1) == 0) ? 2 : 6), \
    ((((M) & 1) == 0) ? 3 : 7) );})

/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [4 x double], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
  /* Both indices stay in 0-3, i.e. within the first operand V. */ \
  (__m128d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V), \
    (__v4df)(_mm256_undefined_pd()), \
    ((((M) & 1) == 0) ? 0 : 2), \
    ((((M) & 1) == 0) ? 1 : 3) );})

/// \brief Extracts either the upper or the lower 128 bits from a 256-bit
///    integer vector, as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
  /* Both indices stay in 0-3, i.e. within the first operand V. */ \
  (__m128i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V), \
    (__v4di)(_mm256_undefined_si256()), \
    ((((M) & 1) == 0) ? 0 : 2), \
    ((((M) & 1) == 0) ? 1 : 3) );})

/* SIMD load ops (unaligned) */
/// \brief Loads two 128-bit floating-point vectors of [4 x float] from
///    unaligned memory locations and constructs a 256-bit floating-point vector
///    of [8 x float] by concatenating the two 128-bit vectors.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to load instructions followed by the
///   <c> VINSERTF128 </c> instruction.
///
/// \param __addr_hi
///    A pointer to a 128-bit memory location containing 4 consecutive
///    single-precision floating-point values.
These values are to be copied to 4871 /// bits[255:128] of the result. The address of the memory location does not 4872 /// have to be aligned. 4873 /// \param __addr_lo 4874 /// A pointer to a 128-bit memory location containing 4 consecutive 4875 /// single-precision floating-point values. These values are to be copied to 4876 /// bits[127:0] of the result. The address of the memory location does not 4877 /// have to be aligned. 4878 /// \returns A 256-bit floating-point vector of [8 x float] containing the 4879 /// concatenated result. 4880 static __inline __m256 __DEFAULT_FN_ATTRS 4881 _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo) 4882 { 4883 __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo)); 4884 return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1); 4885 } 4886 4887 /// \brief Loads two 128-bit floating-point vectors of [2 x double] from 4888 /// unaligned memory locations and constructs a 256-bit floating-point vector 4889 /// of [4 x double] by concatenating the two 128-bit vectors. 4890 /// 4891 /// \headerfile <x86intrin.h> 4892 /// 4893 /// This intrinsic corresponds to load instructions followed by the 4894 /// <c> VINSERTF128 </c> instruction. 4895 /// 4896 /// \param __addr_hi 4897 /// A pointer to a 128-bit memory location containing two consecutive 4898 /// double-precision floating-point values. These values are to be copied to 4899 /// bits[255:128] of the result. The address of the memory location does not 4900 /// have to be aligned. 4901 /// \param __addr_lo 4902 /// A pointer to a 128-bit memory location containing two consecutive 4903 /// double-precision floating-point values. These values are to be copied to 4904 /// bits[127:0] of the result. The address of the memory location does not 4905 /// have to be aligned. 4906 /// \returns A 256-bit floating-point vector of [4 x double] containing the 4907 /// concatenated result. 
4908 static __inline __m256d __DEFAULT_FN_ATTRS 4909 _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo) 4910 { 4911 __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo)); 4912 return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1); 4913 } 4914 4915 /// \brief Loads two 128-bit integer vectors from unaligned memory locations and 4916 /// constructs a 256-bit integer vector by concatenating the two 128-bit 4917 /// vectors. 4918 /// 4919 /// \headerfile <x86intrin.h> 4920 /// 4921 /// This intrinsic corresponds to load instructions followed by the 4922 /// <c> VINSERTF128 </c> instruction. 4923 /// 4924 /// \param __addr_hi 4925 /// A pointer to a 128-bit memory location containing a 128-bit integer 4926 /// vector. This vector is to be copied to bits[255:128] of the result. The 4927 /// address of the memory location does not have to be aligned. 4928 /// \param __addr_lo 4929 /// A pointer to a 128-bit memory location containing a 128-bit integer 4930 /// vector. This vector is to be copied to bits[127:0] of the result. The 4931 /// address of the memory location does not have to be aligned. 4932 /// \returns A 256-bit integer vector containing the concatenated result. 4933 static __inline __m256i __DEFAULT_FN_ATTRS 4934 _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo) 4935 { 4936 __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo)); 4937 return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1); 4938 } 4939 4940 /* SIMD store ops (unaligned) */ 4941 /// \brief Stores the upper and lower 128 bits of a 256-bit floating-point 4942 /// vector of [8 x float] into two different unaligned memory locations. 4943 /// 4944 /// \headerfile <x86intrin.h> 4945 /// 4946 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 4947 /// store instructions. 4948 /// 4949 /// \param __addr_hi 4950 /// A pointer to a 128-bit memory location. 
Bits[255:128] of \a __a are to be 4951 /// copied to this memory location. The address of this memory location does 4952 /// not have to be aligned. 4953 /// \param __addr_lo 4954 /// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 4955 /// copied to this memory location. The address of this memory location does 4956 /// not have to be aligned. 4957 /// \param __a 4958 /// A 256-bit floating-point vector of [8 x float]. 4959 static __inline void __DEFAULT_FN_ATTRS 4960 _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a) 4961 { 4962 __m128 __v128; 4963 4964 __v128 = _mm256_castps256_ps128(__a); 4965 _mm_storeu_ps(__addr_lo, __v128); 4966 __v128 = _mm256_extractf128_ps(__a, 1); 4967 _mm_storeu_ps(__addr_hi, __v128); 4968 } 4969 4970 /// \brief Stores the upper and lower 128 bits of a 256-bit floating-point 4971 /// vector of [4 x double] into two different unaligned memory locations. 4972 /// 4973 /// \headerfile <x86intrin.h> 4974 /// 4975 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 4976 /// store instructions. 4977 /// 4978 /// \param __addr_hi 4979 /// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 4980 /// copied to this memory location. The address of this memory location does 4981 /// not have to be aligned. 4982 /// \param __addr_lo 4983 /// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 4984 /// copied to this memory location. The address of this memory location does 4985 /// not have to be aligned. 4986 /// \param __a 4987 /// A 256-bit floating-point vector of [4 x double]. 
4988 static __inline void __DEFAULT_FN_ATTRS 4989 _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a) 4990 { 4991 __m128d __v128; 4992 4993 __v128 = _mm256_castpd256_pd128(__a); 4994 _mm_storeu_pd(__addr_lo, __v128); 4995 __v128 = _mm256_extractf128_pd(__a, 1); 4996 _mm_storeu_pd(__addr_hi, __v128); 4997 } 4998 4999 /// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into 5000 /// two different unaligned memory locations. 5001 /// 5002 /// \headerfile <x86intrin.h> 5003 /// 5004 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 5005 /// store instructions. 5006 /// 5007 /// \param __addr_hi 5008 /// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 5009 /// copied to this memory location. The address of this memory location does 5010 /// not have to be aligned. 5011 /// \param __addr_lo 5012 /// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 5013 /// copied to this memory location. The address of this memory location does 5014 /// not have to be aligned. 5015 /// \param __a 5016 /// A 256-bit integer vector. 5017 static __inline void __DEFAULT_FN_ATTRS 5018 _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a) 5019 { 5020 __m128i __v128; 5021 5022 __v128 = _mm256_castsi256_si128(__a); 5023 _mm_storeu_si128(__addr_lo, __v128); 5024 __v128 = _mm256_extractf128_si256(__a, 1); 5025 _mm_storeu_si128(__addr_hi, __v128); 5026 } 5027 5028 /// \brief Constructs a 256-bit floating-point vector of [8 x float] by 5029 /// concatenating two 128-bit floating-point vectors of [4 x float]. 5030 /// 5031 /// \headerfile <x86intrin.h> 5032 /// 5033 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5034 /// 5035 /// \param __hi 5036 /// A 128-bit floating-point vector of [4 x float] to be copied to the upper 5037 /// 128 bits of the result. 
5038 /// \param __lo 5039 /// A 128-bit floating-point vector of [4 x float] to be copied to the lower 5040 /// 128 bits of the result. 5041 /// \returns A 256-bit floating-point vector of [8 x float] containing the 5042 /// concatenated result. 5043 static __inline __m256 __DEFAULT_FN_ATTRS 5044 _mm256_set_m128 (__m128 __hi, __m128 __lo) 5045 { 5046 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7); 5047 } 5048 5049 /// \brief Constructs a 256-bit floating-point vector of [4 x double] by 5050 /// concatenating two 128-bit floating-point vectors of [2 x double]. 5051 /// 5052 /// \headerfile <x86intrin.h> 5053 /// 5054 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5055 /// 5056 /// \param __hi 5057 /// A 128-bit floating-point vector of [2 x double] to be copied to the upper 5058 /// 128 bits of the result. 5059 /// \param __lo 5060 /// A 128-bit floating-point vector of [2 x double] to be copied to the lower 5061 /// 128 bits of the result. 5062 /// \returns A 256-bit floating-point vector of [4 x double] containing the 5063 /// concatenated result. 5064 static __inline __m256d __DEFAULT_FN_ATTRS 5065 _mm256_set_m128d (__m128d __hi, __m128d __lo) 5066 { 5067 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); 5068 } 5069 5070 /// \brief Constructs a 256-bit integer vector by concatenating two 128-bit 5071 /// integer vectors. 5072 /// 5073 /// \headerfile <x86intrin.h> 5074 /// 5075 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5076 /// 5077 /// \param __hi 5078 /// A 128-bit integer vector to be copied to the upper 128 bits of the 5079 /// result. 5080 /// \param __lo 5081 /// A 128-bit integer vector to be copied to the lower 128 bits of the 5082 /// result. 5083 /// \returns A 256-bit integer vector containing the concatenated result. 
5084 static __inline __m256i __DEFAULT_FN_ATTRS 5085 _mm256_set_m128i (__m128i __hi, __m128i __lo) 5086 { 5087 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); 5088 } 5089 5090 /// \brief Constructs a 256-bit floating-point vector of [8 x float] by 5091 /// concatenating two 128-bit floating-point vectors of [4 x float]. This is 5092 /// similar to _mm256_set_m128, but the order of the input parameters is 5093 /// swapped. 5094 /// 5095 /// \headerfile <x86intrin.h> 5096 /// 5097 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5098 /// 5099 /// \param __lo 5100 /// A 128-bit floating-point vector of [4 x float] to be copied to the lower 5101 /// 128 bits of the result. 5102 /// \param __hi 5103 /// A 128-bit floating-point vector of [4 x float] to be copied to the upper 5104 /// 128 bits of the result. 5105 /// \returns A 256-bit floating-point vector of [8 x float] containing the 5106 /// concatenated result. 5107 static __inline __m256 __DEFAULT_FN_ATTRS 5108 _mm256_setr_m128 (__m128 __lo, __m128 __hi) 5109 { 5110 return _mm256_set_m128(__hi, __lo); 5111 } 5112 5113 /// \brief Constructs a 256-bit floating-point vector of [4 x double] by 5114 /// concatenating two 128-bit floating-point vectors of [2 x double]. This is 5115 /// similar to _mm256_set_m128d, but the order of the input parameters is 5116 /// swapped. 5117 /// 5118 /// \headerfile <x86intrin.h> 5119 /// 5120 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5121 /// 5122 /// \param __lo 5123 /// A 128-bit floating-point vector of [2 x double] to be copied to the lower 5124 /// 128 bits of the result. 5125 /// \param __hi 5126 /// A 128-bit floating-point vector of [2 x double] to be copied to the upper 5127 /// 128 bits of the result. 5128 /// \returns A 256-bit floating-point vector of [4 x double] containing the 5129 /// concatenated result. 
5130 static __inline __m256d __DEFAULT_FN_ATTRS 5131 _mm256_setr_m128d (__m128d __lo, __m128d __hi) 5132 { 5133 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); 5134 } 5135 5136 /// \brief Constructs a 256-bit integer vector by concatenating two 128-bit 5137 /// integer vectors. This is similar to _mm256_set_m128i, but the order of 5138 /// the input parameters is swapped. 5139 /// 5140 /// \headerfile <x86intrin.h> 5141 /// 5142 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5143 /// 5144 /// \param __lo 5145 /// A 128-bit integer vector to be copied to the lower 128 bits of the 5146 /// result. 5147 /// \param __hi 5148 /// A 128-bit integer vector to be copied to the upper 128 bits of the 5149 /// result. 5150 /// \returns A 256-bit integer vector containing the concatenated result. 5151 static __inline __m256i __DEFAULT_FN_ATTRS 5152 _mm256_setr_m128i (__m128i __lo, __m128i __hi) 5153 { 5154 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); 5155 } 5156 5157 #undef __DEFAULT_FN_ATTRS 5158 5159 #endif /* __AVXINTRIN_H */ 5160