/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVXINTRIN_H
#define __AVXINTRIN_H

/* Internal 256-bit vector types used to implement the intrinsics below.
 * The element type of each typedef determines which generic vector
 * operations (and which builtin signatures) it can be used with. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Unsigned types */
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* Public 256-bit vector types exposed to users of this header. */
typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Define the default attributes for the functions in this file.
 * Every intrinsic is force-inlined, hidden from the debugger, and compiled
 * as if -mavx were in effect for its body. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))

/* Arithmetic */
/// \brief Adds two 256-bit vectors of [4 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VADDPD / ADDPD instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [4 x double] containing one of the source operands.
/// \returns A 256-bit vector of [4 x double] containing the sums of both
///    operands.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_add_pd(__m256d __a, __m256d __b)
{
  /* Implemented with generic vector arithmetic rather than a builtin so the
   * compiler can constant-fold and otherwise optimize the operation. */
  return (__m256d)((__v4df)__a+(__v4df)__b);
}

/// \brief Adds two 256-bit vectors of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VADDPS / ADDPS instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [8 x float] containing one of the source operands.
/// \returns A 256-bit vector of [8 x float] containing the sums of both
///    operands.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_add_ps(__m256 __a, __m256 __b)
{
  return (__m256)((__v8sf)__a+(__v8sf)__b);
}

/// \brief Subtracts two 256-bit vectors of [4 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VSUBPD / SUBPD instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double] containing the minuend.
/// \param __b
///    A 256-bit vector of [4 x double] containing the subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the differences between
///    both operands.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_sub_pd(__m256d __a, __m256d __b)
{
  return (__m256d)((__v4df)__a-(__v4df)__b);
}

/// \brief Subtracts two 256-bit vectors of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VSUBPS / SUBPS instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float] containing the minuend.
/// \param __b
///    A 256-bit vector of [8 x float] containing the subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the differences between
///    both operands.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_sub_ps(__m256 __a, __m256 __b)
{
  return (__m256)((__v8sf)__a-(__v8sf)__b);
}

/// \brief Subtracts the even-indexed values and adds the odd-indexed values of
///    two 256-bit vectors of [4 x double]. That is, for each even index i the
///    result element is __a[i] - __b[i], and for each odd index i it is
///    __a[i] + __b[i].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VADDSUBPD / ADDSUBPD instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double] containing the left source operand.
/// \param __b
///    A 256-bit vector of [4 x double] containing the right source operand.
/// \returns A 256-bit vector of [4 x double] containing the alternating
///    differences and sums of both operands.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_addsub_pd(__m256d __a, __m256d __b)
{
  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
}

/// \brief Subtracts the even-indexed values and adds the odd-indexed values of
///    two 256-bit vectors of [8 x float]. That is, for each even index i the
///    result element is __a[i] - __b[i], and for each odd index i it is
///    __a[i] + __b[i].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VADDSUBPS / ADDSUBPS instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float] containing the left source operand.
/// \param __b
///    A 256-bit vector of [8 x float] containing the right source operand.
/// \returns A 256-bit vector of [8 x float] containing the alternating
///    differences and sums of both operands.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_addsub_ps(__m256 __a, __m256 __b)
{
  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
}

/// \brief Divides two 256-bit vectors of [4 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VDIVPD / DIVPD instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double] containing the dividend.
/// \param __b
///    A 256-bit vector of [4 x double] containing the divisor.
/// \returns A 256-bit vector of [4 x double] containing the quotients of both
///    operands.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_div_pd(__m256d __a, __m256d __b)
{
  return (__m256d)((__v4df)__a/(__v4df)__b);
}

/// \brief Divides two 256-bit vectors of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VDIVPS / DIVPS instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float] containing the dividend.
/// \param __b
///    A 256-bit vector of [8 x float] containing the divisor.
/// \returns A 256-bit vector of [8 x float] containing the quotients of both
///    operands.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_div_ps(__m256 __a, __m256 __b)
{
  return (__m256)((__v8sf)__a/(__v8sf)__b);
}

/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
///    of each pair of values.
///
/// NOTE(review): like the underlying MAXPD instruction, when an element pair
/// contains a NaN the result is presumably taken from \a __b -- confirm
/// against the ISA reference before relying on NaN behavior.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMAXPD / MAXPD instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double] containing one of the operands.
/// \param __b
///    A 256-bit vector of [4 x double] containing one of the operands.
/// \returns A 256-bit vector of [4 x double] containing the maximum values
///    between both operands.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_max_pd(__m256d __a, __m256d __b)
{
  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
}

/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
///    of each pair of values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMAXPS / MAXPS instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float] containing one of the operands.
/// \param __b
///    A 256-bit vector of [8 x float] containing one of the operands.
/// \returns A 256-bit vector of [8 x float] containing the maximum values
///    between both operands.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_max_ps(__m256 __a, __m256 __b)
{
  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
}

/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
///    of each pair of values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMINPD / MINPD instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double] containing one of the operands.
/// \param __b
///    A 256-bit vector of [4 x double] containing one of the operands.
/// \returns A 256-bit vector of [4 x double] containing the minimum values
///    between both operands.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_min_pd(__m256d __a, __m256d __b)
{
  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
}

/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
///    of each pair of values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMINPS / MINPS instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float] containing one of the operands.
/// \param __b
///    A 256-bit vector of [8 x float] containing one of the operands.
/// \returns A 256-bit vector of [8 x float] containing the minimum values
///    between both operands.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_min_ps(__m256 __a, __m256 __b)
{
  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
}

/// \brief Multiplies two 256-bit vectors of [4 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMULPD / MULPD instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double] containing one of the operands.
/// \param __b
///    A 256-bit vector of [4 x double] containing one of the operands.
/// \returns A 256-bit vector of [4 x double] containing the products of both
///    operands.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_mul_pd(__m256d __a, __m256d __b)
{
  return (__m256d)((__v4df)__a * (__v4df)__b);
}

/// \brief Multiplies two 256-bit vectors of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMULPS / MULPS instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float] containing one of the operands.
/// \param __b
///    A 256-bit vector of [8 x float] containing one of the operands.
/// \returns A 256-bit vector of [8 x float] containing the products of both
///    operands.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_mul_ps(__m256 __a, __m256 __b)
{
  return (__m256)((__v8sf)__a * (__v8sf)__b);
}

/// \brief Calculates the square roots of the values in a 256-bit vector of
///    [4 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VSQRTPD / SQRTPD instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the square roots of the
///    values in the operand.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_sqrt_pd(__m256d __a)
{
  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
}

/// \brief Calculates the square roots of the values in a 256-bit vector of
///    [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the square roots of the
///    values in the operand.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_sqrt_ps(__m256 __a)
{
  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
}

/// \brief Calculates the reciprocal square roots of the values in a 256-bit
///    vector of [8 x float].
///
/// NOTE(review): VRSQRTPS produces an approximation, not a correctly rounded
/// result -- see the ISA reference for the guaranteed relative error bound.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
///    roots of the values in the operand.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_rsqrt_ps(__m256 __a)
{
  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
}

/// \brief Calculates the reciprocals of the values in a 256-bit vector of
///    [8 x float].
///
/// NOTE(review): VRCPPS produces an approximation, not a correctly rounded
/// result -- see the ISA reference for the guaranteed relative error bound.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VRCPPS / RCPPS instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
///    values in the operand.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_rcp_ps(__m256 __a)
{
  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
}

/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation.
///    Bits [7:4] are reserved.
///    Bit [3] is a precision exception (PE) value:
///      0: A normal PE exception is used.
///      1: The PE field is not updated.
///    Bit [2] is the rounding control source:
///      0: Use bits [1:0] of M.
///      1: Use the current MXCSR setting.
///    Bits [1:0] contain the rounding control definition:
///      00: Nearest.
///      01: Downward (toward negative infinity).
///      10: Upward (toward positive infinity).
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* Implemented as a macro because M must be an immediate for the builtin. */
#define _mm256_round_pd(V, M) __extension__ ({ \
    (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); })

/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation.
///    Bits [7:4] are reserved.
///    Bit [3] is a precision exception (PE) value:
///      0: A normal PE exception is used.
///      1: The PE field is not updated.
///    Bit [2] is the rounding control source:
///      0: Use bits [1:0] of M.
///      1: Use the current MXCSR setting.
///    Bits [1:0] contain the rounding control definition:
///      00: Nearest.
///      01: Downward (toward negative infinity).
///      10: Upward (toward positive infinity).
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* Implemented as a macro because M must be an immediate for the builtin. */
#define _mm256_round_ps(V, M) __extension__ ({ \
  (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); })

/// \brief Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)

/// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)

/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)

/// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)

/* Logical */
/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VANDPD / ANDPD instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [4 x double] containing one of the source operands.
/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
///    values between both operands.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_and_pd(__m256d __a, __m256d __b)
{
  /* Bitwise ops are done on the unsigned-integer view of the vectors. */
  return (__m256d)((__v4du)__a & (__v4du)__b);
}

/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VANDPS / ANDPS instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [8 x float] containing one of the source operands.
/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
///    values between both operands.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_and_ps(__m256 __a, __m256 __b)
{
  /* Bitwise ops are done on the unsigned-integer view of the vectors. */
  return (__m256)((__v8su)__a & (__v8su)__b);
}

/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
///    the one's complement of the values contained in the first source operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VANDNPD / ANDNPD instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double] containing the left source operand. The
///    one's complement of this value is used in the bitwise AND.
/// \param __b
///    A 256-bit vector of [4 x double] containing the right source operand.
/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
///    values of the second operand and the one's complement of the first
///    operand.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_andnot_pd(__m256d __a, __m256d __b)
{
  /* Note the asymmetry: only __a is complemented, matching ANDNPD. */
  return (__m256d)(~(__v4du)__a & (__v4du)__b);
}

/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
///    the one's complement of the values contained in the first source operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float] containing the left source operand. The
///    one's complement of this value is used in the bitwise AND.
/// \param __b
///    A 256-bit vector of [8 x float] containing the right source operand.
/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
///    values of the second operand and the one's complement of the first
///    operand.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_andnot_ps(__m256 __a, __m256 __b)
{
  /* Note the asymmetry: only __a is complemented, matching ANDNPS. */
  return (__m256)(~(__v8su)__a & (__v8su)__b);
}

/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VORPD / ORPD instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [4 x double] containing one of the source operands.
/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
///    values between both operands.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_or_pd(__m256d __a, __m256d __b)
{
  return (__m256d)((__v4du)__a | (__v4du)__b);
}

/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VORPS / ORPS instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [8 x float] containing one of the source operands.
/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
///    values between both operands.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_or_ps(__m256 __a, __m256 __b)
{
  return (__m256)((__v8su)__a | (__v8su)__b);
}

/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VXORPD / XORPD instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [4 x double] containing one of the source operands.
/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
///    values between both operands.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_xor_pd(__m256d __a, __m256d __b)
{
  return (__m256d)((__v4du)__a ^ (__v4du)__b);
}

/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [8 x float] containing one of the source operands.
/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
///    values between both operands.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_xor_ps(__m256 __a, __m256 __b)
{
  return (__m256)((__v8su)__a ^ (__v8su)__b);
}

/* Horizontal arithmetic */
/// \brief Horizontally adds the adjacent pairs of values contained in two
///    256-bit vectors of [4 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VHADDPD / HADDPD instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double] containing one of the source operands.
///    The horizontal sums of the values are returned in the even-indexed
///    elements of a vector of [4 x double].
/// \param __b
///    A 256-bit vector of [4 x double] containing one of the source operands.
///    The horizontal sums of the values are returned in the odd-indexed
///    elements of a vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
///    both operands.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_hadd_pd(__m256d __a, __m256d __b)
{
  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
}

/// \brief Horizontally adds the adjacent pairs of values contained in two
///    256-bit vectors of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VHADDPS / HADDPS instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float] containing one of the source operands.
///    The horizontal sums of the values are returned in the elements with
///    index 0, 1, 4, 5 of a vector of [8 x float].
/// \param __b
///    A 256-bit vector of [8 x float] containing one of the source operands.
///    The horizontal sums of the values are returned in the elements with
///    index 2, 3, 6, 7 of a vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
///    both operands.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_hadd_ps(__m256 __a, __m256 __b)
{
  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
}

/// \brief Horizontally subtracts the adjacent pairs of values contained in two
///    256-bit vectors of [4 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VHSUBPD / HSUBPD instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double] containing one of the source operands.
///    The horizontal differences between the values are returned in the
///    even-indexed elements of a vector of [4 x double].
/// \param __b
///    A 256-bit vector of [4 x double] containing one of the source operands.
///    The horizontal differences between the values are returned in the
///    odd-indexed elements of a vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the horizontal
///    differences of both operands.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_hsub_pd(__m256d __a, __m256d __b)
{
  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
}

/// \brief Horizontally subtracts the adjacent pairs of values contained in two
///    256-bit vectors of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VHSUBPS / HSUBPS instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float] containing one of the source operands.
///    The horizontal differences between the values are returned in the
///    elements with index 0, 1, 4, 5 of a vector of [8 x float].
/// \param __b
///    A 256-bit vector of [8 x float] containing one of the source operands.
///    The horizontal differences between the values are returned in the
///    elements with index 2, 3, 6, 7 of a vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the horizontal
///    differences of both operands.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_hsub_ps(__m256 __a, __m256 __b)
{
  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
}

/* Vector permutations */
/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
///    by the 128-bit integer vector operand. Only bit 1 of each 64-bit
///    control element is consulted; all other control bits are ignored.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double].
/// \param __c
///    A 128-bit integer vector operand specifying how the values are to be
///    copied.
///    Bit [1]:
///      0: Bits [63:0] of the source are copied to bits [63:0] of the
///         returned vector.
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector.
///    Bit [65]:
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector.
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
static __inline __m128d __DEFAULT_FN_ATTRS
_mm_permutevar_pd(__m128d __a, __m128i __c)
{
  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
}

/// \brief Copies the values in a 256-bit vector of [4 x double] as
///    specified by the 256-bit integer vector operand. Only bit 1 of each
///    64-bit control element is consulted, and each 128-bit lane is permuted
///    independently.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double].
/// \param __c
///    A 256-bit integer vector operand specifying how the values are to be
///    copied.
///    Bit [1]:
///      0: Bits [63:0] of the source are copied to bits [63:0] of the
///         returned vector.
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector.
///    Bit [65]:
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector.
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector.
///    Bit [129]:
///      0: Bits [191:128] of the source are copied to bits [191:128] of the
///         returned vector.
///      1: Bits [255:192] of the source are copied to bits [191:128] of the
///         returned vector.
///    Bit [193]:
///      0: Bits [191:128] of the source are copied to bits [255:192] of the
///         returned vector.
///      1: Bits [255:192] of the source are copied to bits [255:192] of the
///         returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_permutevar_pd(__m256d __a, __m256i __c)
{
  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
}

/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
///    specified by the 128-bit integer vector operand. Only bits [1:0] of
///    each 32-bit control element are consulted.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __c
///    A 128-bit integer vector operand specifying how the values are to be
///    copied.
///    Bits [1:0]:
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector.
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector.
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector.
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector.
///    Bits [33:32]:
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector.
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector.
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector.
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector.
///    Bits [65:64]:
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector.
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector.
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector.
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector.
///    Bits [97:96]:
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector.
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector.
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector.
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
static __inline __m128 __DEFAULT_FN_ATTRS
_mm_permutevar_ps(__m128 __a, __m128i __c)
{
  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
}

/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
///    specified by the 256-bit integer vector operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float].
/// \param __c
///    A 256-bit integer vector operand specifying how the values are to be
///    copied.
///    Bits [1:0]:
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector.
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector.
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector.
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector.
///    Bits [33:32]:
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector.
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector.
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector.
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector.
///    Bits [65:64]:
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector.
916 /// 01: Bits [63:32] of the source are copied to bits [95:64] of the 917 /// returned vector. 918 /// 10: Bits [95:64] of the source are copied to bits [95:64] of the 919 /// returned vector. 920 /// 11: Bits [127:96] of the source are copied to bits [95:64] of the 921 /// returned vector. 922 /// Bits [97:96]: 923 /// 00: Bits [31:0] of the source are copied to bits [127:96] of the 924 /// returned vector. 925 /// 01: Bits [63:32] of the source are copied to bits [127:96] of the 926 /// returned vector. 927 /// 10: Bits [95:64] of the source are copied to bits [127:96] of the 928 /// returned vector. 929 /// 11: Bits [127:96] of the source are copied to bits [127:96] of the 930 /// returned vector. 931 /// Bits [129:128]: 932 /// 00: Bits [159:128] of the source are copied to bits [159:128] of the 933 /// returned vector. 934 /// 01: Bits [191:160] of the source are copied to bits [159:128] of the 935 /// returned vector. 936 /// 10: Bits [223:192] of the source are copied to bits [159:128] of the 937 /// returned vector. 938 /// 11: Bits [255:224] of the source are copied to bits [159:128] of the 939 /// returned vector. 940 /// Bits [161:160]: 941 /// 00: Bits [159:128] of the source are copied to bits [191:160] of the 942 /// returned vector. 943 /// 01: Bits [191:160] of the source are copied to bits [191:160] of the 944 /// returned vector. 945 /// 10: Bits [223:192] of the source are copied to bits [191:160] of the 946 /// returned vector. 947 /// 11: Bits [255:224] of the source are copied to bits [191:160] of the 948 /// returned vector. 949 /// Bits [193:192]: 950 /// 00: Bits [159:128] of the source are copied to bits [223:192] of the 951 /// returned vector. 952 /// 01: Bits [191:160] of the source are copied to bits [223:192] of the 953 /// returned vector. 954 /// 10: Bits [223:192] of the source are copied to bits [223:192] of the 955 /// returned vector. 
956 /// 11: Bits [255:224] of the source are copied to bits [223:192] of the 957 /// returned vector. 958 /// Bits [225:224]: 959 /// 00: Bits [159:128] of the source are copied to bits [255:224] of the 960 /// returned vector. 961 /// 01: Bits [191:160] of the source are copied to bits [255:224] of the 962 /// returned vector. 963 /// 10: Bits [223:192] of the source are copied to bits [255:224] of the 964 /// returned vector. 965 /// 11: Bits [255:224] of the source are copied to bits [255:224] of the 966 /// returned vector. 967 /// \returns A 256-bit vector of [8 x float] containing the copied values. 968 static __inline __m256 __DEFAULT_FN_ATTRS 969 _mm256_permutevar_ps(__m256 __a, __m256i __c) 970 { 971 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c); 972 } 973 974 /// \brief Copies the values in a 128-bit vector of [2 x double] as 975 /// specified by the immediate integer operand. 976 /// 977 /// \headerfile <x86intrin.h> 978 /// 979 /// \code 980 /// __m128d _mm_permute_pd(__m128d A, const int C); 981 /// \endcode 982 /// 983 /// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction. 984 /// 985 /// \param A 986 /// A 128-bit vector of [2 x double]. 987 /// \param C 988 /// An immediate integer operand specifying how the values are to be copied. 989 /// Bit [0]: 990 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the 991 /// returned vector. 992 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the 993 /// returned vector. 994 /// Bit [1]: 995 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the 996 /// returned vector. 997 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the 998 /// returned vector. 999 /// \returns A 128-bit vector of [2 x double] containing the copied values. 
#define _mm_permute_pd(A, C) __extension__ ({ \
  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
                                   (__v2df)_mm_undefined_pd(), \
                                   ((C) >> 0) & 0x1, ((C) >> 1) & 0x1); })

/// \brief Copies the values in a 256-bit vector of [4 x double] as
///    specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute_pd(__m256d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMILPD instruction.
///
/// \param A
///    A 256-bit vector of [4 x double].
/// \param C
///    An immediate integer operand specifying how the values are to be copied.
///    Bit [0]:
///    0: Bits [63:0] of the source are copied to bits [63:0] of the
///    returned vector.
///    1: Bits [127:64] of the source are copied to bits [63:0] of the
///    returned vector.
///    Bit [1]:
///    0: Bits [63:0] of the source are copied to bits [127:64] of the
///    returned vector.
///    1: Bits [127:64] of the source are copied to bits [127:64] of the
///    returned vector.
///    Bit [2]:
///    0: Bits [191:128] of the source are copied to bits [191:128] of the
///    returned vector.
///    1: Bits [255:192] of the source are copied to bits [191:128] of the
///    returned vector.
///    Bit [3]:
///    0: Bits [191:128] of the source are copied to bits [255:192] of the
///    returned vector.
///    1: Bits [255:192] of the source are copied to bits [255:192] of the
///    returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) __extension__ ({ \
  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
                                   (__v4df)_mm256_undefined_pd(), \
                                   0 + (((C) >> 0) & 0x1), \
                                   0 + (((C) >> 1) & 0x1), \
                                   2 + (((C) >> 2) & 0x1), \
                                   2 + (((C) >> 3) & 0x1)); })

/// \brief Copies the values in a 128-bit vector of [4 x float] as
///    specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_permute_ps(__m128 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMILPS instruction.
///
/// \param A
///    A 128-bit vector of [4 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be copied.
///    Bits [1:0]:
///    00: Bits [31:0] of the source are copied to bits [31:0] of the
///    returned vector.
///    01: Bits [63:32] of the source are copied to bits [31:0] of the
///    returned vector.
///    10: Bits [95:64] of the source are copied to bits [31:0] of the
///    returned vector.
///    11: Bits [127:96] of the source are copied to bits [31:0] of the
///    returned vector.
///    Bits [3:2]:
///    00: Bits [31:0] of the source are copied to bits [63:32] of the
///    returned vector.
///    01: Bits [63:32] of the source are copied to bits [63:32] of the
///    returned vector.
///    10: Bits [95:64] of the source are copied to bits [63:32] of the
///    returned vector.
///    11: Bits [127:96] of the source are copied to bits [63:32] of the
///    returned vector.
///    Bits [5:4]:
///    00: Bits [31:0] of the source are copied to bits [95:64] of the
///    returned vector.
///    01: Bits [63:32] of the source are copied to bits [95:64] of the
///    returned vector.
///    10: Bits [95:64] of the source are copied to bits [95:64] of the
///    returned vector.
///    11: Bits [127:96] of the source are copied to bits [95:64] of the
///    returned vector.
///    Bits [7:6]:
///    00: Bits [31:0] of the source are copied to bits [127:96] of the
///    returned vector.
///    01: Bits [63:32] of the source are copied to bits [127:96] of the
///    returned vector.
///    10: Bits [95:64] of the source are copied to bits [127:96] of the
///    returned vector.
///    11: Bits [127:96] of the source are copied to bits [127:96] of the
///    returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) __extension__ ({ \
  (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
                                  (__v4sf)_mm_undefined_ps(), \
                                  ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
                                  ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })

/// \brief Copies the values in a 256-bit vector of [8 x float] as
///    specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute_ps(__m256 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMILPS instruction.
///
/// \param A
///    A 256-bit vector of [8 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be copied.
///    The same 8-bit selector is applied to both 128-bit halves of the source.
///    Bits [1:0]:
///    00: Bits [31:0] of the source are copied to bits [31:0] of the
///    returned vector.
///    01: Bits [63:32] of the source are copied to bits [31:0] of the
///    returned vector.
///    10: Bits [95:64] of the source are copied to bits [31:0] of the
///    returned vector.
///    11: Bits [127:96] of the source are copied to bits [31:0] of the
///    returned vector.
///    Bits [3:2]:
///    00: Bits [31:0] of the source are copied to bits [63:32] of the
///    returned vector.
///    01: Bits [63:32] of the source are copied to bits [63:32] of the
///    returned vector.
///    10: Bits [95:64] of the source are copied to bits [63:32] of the
///    returned vector.
///    11: Bits [127:96] of the source are copied to bits [63:32] of the
///    returned vector.
///    Bits [5:4]:
///    00: Bits [31:0] of the source are copied to bits [95:64] of the
///    returned vector.
///    01: Bits [63:32] of the source are copied to bits [95:64] of the
///    returned vector.
///    10: Bits [95:64] of the source are copied to bits [95:64] of the
///    returned vector.
///    11: Bits [127:96] of the source are copied to bits [95:64] of the
///    returned vector.
///    Bits [7:6]:
///    00: Bits [31:0] of the source are copied to bits [127:96] of the
///    returned vector.
///    01: Bits [63:32] of the source are copied to bits [127:96] of the
///    returned vector.
///    10: Bits [95:64] of the source are copied to bits [127:96] of the
///    returned vector.
///    11: Bits [127:96] of the source are copied to bits [127:96] of the
///    returned vector.
///    Bits [1:0]:
///    00: Bits [159:128] of the source are copied to bits [159:128] of the
///    returned vector.
///    01: Bits [191:160] of the source are copied to bits [159:128] of the
///    returned vector.
///    10: Bits [223:192] of the source are copied to bits [159:128] of the
///    returned vector.
///    11: Bits [255:224] of the source are copied to bits [159:128] of the
///    returned vector.
///    Bits [3:2]:
///    00: Bits [159:128] of the source are copied to bits [191:160] of the
///    returned vector.
///    01: Bits [191:160] of the source are copied to bits [191:160] of the
///    returned vector.
///    10: Bits [223:192] of the source are copied to bits [191:160] of the
///    returned vector.
///    11: Bits [255:224] of the source are copied to bits [191:160] of the
///    returned vector.
///    Bits [5:4]:
///    00: Bits [159:128] of the source are copied to bits [223:192] of the
///    returned vector.
///    01: Bits [191:160] of the source are copied to bits [223:192] of the
///    returned vector.
///    10: Bits [223:192] of the source are copied to bits [223:192] of the
///    returned vector.
///    11: Bits [255:224] of the source are copied to bits [223:192] of the
///    returned vector.
///    Bits [7:6]:
///    00: Bits [159:128] of the source are copied to bits [255:224] of the
///    returned vector.
///    01: Bits [191:160] of the source are copied to bits [255:224] of the
///    returned vector.
///    10: Bits [223:192] of the source are copied to bits [255:224] of the
///    returned vector.
///    11: Bits [255:224] of the source are copied to bits [255:224] of the
///    returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) __extension__ ({ \
  (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
                                  (__v8sf)_mm256_undefined_ps(), \
                                  0 + (((C) >> 0) & 0x3), \
                                  0 + (((C) >> 2) & 0x3), \
                                  0 + (((C) >> 4) & 0x3), \
                                  0 + (((C) >> 6) & 0x3), \
                                  4 + (((C) >> 0) & 0x3), \
                                  4 + (((C) >> 2) & 0x3), \
                                  4 + (((C) >> 4) & 0x3), \
                                  4 + (((C) >> 6) & 0x3)); })

/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
///    [4 x double], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2F128 instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted.
///    Bits [1:0]:
///    00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
///    destination.
///    01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
///    destination.
///    10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
///    destination.
///    11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
///    destination.
///    Bits [5:4]:
///    00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
///    destination.
///    01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
///    destination.
///    10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
///    destination.
///    11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
                                           (__v4df)(__m256d)(V2), (M)); })

/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
///    [8 x float], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2F128 instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted.
///    Bits [1:0]:
///    00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
///    destination.
///    01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
///    destination.
///    10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
///    destination.
///    11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
///    destination.
///    Bits [5:4]:
///    00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
///    destination.
///    01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
///    destination.
///    10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
///    destination.
///    11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
///    destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                          (__v8sf)(__m256)(V2), (M)); })

/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
///    as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2F128 instruction.
///
/// \param V1
///    A 256-bit integer vector.
/// \param V2
///    A 256-bit integer vector.
/// \param M
///    An immediate integer operand specifying how the values are to be copied.
///    Bits [1:0]:
///    00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
///    destination.
///    01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
///    destination.
///    10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
///    destination.
///    11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
///    destination.
///    Bits [5:4]:
///    00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
///    destination.
///    01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
///    destination.
///    10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
///    destination.
///    11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
///    destination.
/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                           (__v8si)(__m256i)(V2), (M)); })

/* Vector Blend */
/// \brief Merges 64-bit double-precision data values stored in either of the
///    two 256-bit vectors of [4 x double], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VBLENDPD instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand V1 is copied to the same position in the destination.
///    When a mask bit is 1, the corresponding 64-bit element in operand V2 is
///    copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
                                   (__v4df)(__m256d)(V2), \
                                   (((M) & 0x01) ? 4 : 0), \
                                   (((M) & 0x02) ? 5 : 1), \
                                   (((M) & 0x04) ? 6 : 2), \
                                   (((M) & 0x08) ? 7 : 3)); })

/// \brief Merges 32-bit single-precision data values stored in either of the
///    two 256-bit vectors of [8 x float], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VBLENDPS instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand V1 is copied to the same position in the destination.
///    When a mask bit is 1, the corresponding 32-bit element in operand V2 is
///    copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
                                  (__v8sf)(__m256)(V2), \
                                  (((M) & 0x01) ?  8 : 0), \
                                  (((M) & 0x02) ?  9 : 1), \
                                  (((M) & 0x04) ? 10 : 2), \
                                  (((M) & 0x08) ? 11 : 3), \
                                  (((M) & 0x10) ? 12 : 4), \
                                  (((M) & 0x20) ? 13 : 5), \
                                  (((M) & 0x40) ? 14 : 6), \
                                  (((M) & 0x80) ? 15 : 7)); })

/// \brief Merges 64-bit double-precision data values stored in either of the
///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
///    operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VBLENDVPD instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double].
/// \param __b
///    A 256-bit vector of [4 x double].
/// \param __c
///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
///    how the values are to be copied. The position of the mask bit corresponds
///    to the most significant bit of a copied value. When a mask bit is 0, the
///    corresponding 64-bit element in operand __a is copied to the same
///    position in the destination. When a mask bit is 1, the corresponding
///    64-bit element in operand __b is copied to the same position in the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
{
  return (__m256d)__builtin_ia32_blendvpd256(
    (__v4df)__a, (__v4df)__b, (__v4df)__c);
}

/// \brief Merges 32-bit single-precision data values stored in either of the
///    two 256-bit vectors of [8 x float], as specified by the 256-bit vector
///    operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VBLENDVPS instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float].
/// \param __b
///    A 256-bit vector of [8 x float].
/// \param __c
///    A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
///    and 31 specifying how the values are to be copied. The position of the
///    mask bit corresponds to the most significant bit of a copied value. When
///    a mask bit is 0, the corresponding 32-bit element in operand __a is
///    copied to the same position in the destination. When a mask bit is 1, the
///    corresponding 32-bit element in operand __b is copied to the same
///    position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
{
  return (__m256)__builtin_ia32_blendvps256(
    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
}

/* Vector Dot Product */
/// \brief Computes two dot products in parallel, using the lower and upper
///    halves of two [8 x float] vectors as input to the two computations, and
///    returning the two dot products in the lower and upper halves of the
///    [8 x float] result. The immediate integer operand controls which
///    input elements will contribute to the dot product, and where the final
///    results are returned. In general, for each dot product, the four
///    corresponding elements of the input vectors are multiplied; the first
///    two and second two products are summed, then the two sums are added to
///    form the final result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VDPPS instruction.
///
/// \param V1
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param V2
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param M
///    An immediate integer argument. Bits [7:4] determine which elements of
///    the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [7] corresponding to the highest element of each [4 x
///    float] subvector. If a bit is set, the corresponding elements from the
///    two input vectors are used as an input for dot product; otherwise that
///    input is treated as zero. Bits [3:0] determine which elements of the
///    result will receive a copy of the final dot product, with bit [0]
///    corresponding to the lowest element and bit [3] corresponding to the
///    highest element of each [4 x float] subvector. If a bit is set, the dot
///    product is returned in the corresponding element; otherwise that element
///    is set to zero. The bitmask is applied in the same way to each of the
///    two parallel dot product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                 (__v8sf)(__m256)(V2), (M)); })

/* Vector shuffle */
/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
///    specified by the immediate value operand. The four selected elements in
///    each operand are copied to the destination according to the bits
///    specified in the immediate operand. The selected elements from the first
///    256-bit operand are copied to bits [63:0] and bits [191:128] of the
///    destination, and the selected elements from the second 256-bit operand
///    are copied to bits [127:64] and bits [255:192] of the destination. For
///    example, if bits [7:0] of the immediate operand contain a value of 0xFF,
///    the 256-bit destination vector would contain the following values: b[7],
///    b[7], a[7], a[7], b[3], b[3], a[3], a[3].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the \c VSHUFPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [63:0] and bits [191:128] in the destination,
///    according to the bits specified in the immediate operand.
/// \param b
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [127:64] and bits [255:192] in the
///    destination, according to the bits specified in the immediate operand.
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from a and b. Bits [3:0] specify the values copied from operand a.
///    Bits [7:4] specify the values copied from operand b.
///    The destinations within the 256-bit destination are assigned values as
///    follows, according to the bit value assignments described below:
///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
///    destination.
///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
///    destination.
///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
///    destination.
///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
///    the destination.
///    Bit value assignments:
///    00: Bits [31:0] and [159:128] are copied from the selected operand.
///    01: Bits [63:32] and [191:160] are copied from the selected operand.
///    10: Bits [95:64] and [223:192] are copied from the selected operand.
///    11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
1543 #define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \ 1544 (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \ 1545 (__v8sf)(__m256)(b), \ 1546 0 + (((mask) >> 0) & 0x3), \ 1547 0 + (((mask) >> 2) & 0x3), \ 1548 8 + (((mask) >> 4) & 0x3), \ 1549 8 + (((mask) >> 6) & 0x3), \ 1550 4 + (((mask) >> 0) & 0x3), \ 1551 4 + (((mask) >> 2) & 0x3), \ 1552 12 + (((mask) >> 4) & 0x3), \ 1553 12 + (((mask) >> 6) & 0x3)); }) 1554 1555 /// \brief Selects four double-precision values from the 256-bit operands of 1556 /// [4 x double], as specified by the immediate value operand. The selected 1557 /// elements from the first 256-bit operand are copied to bits [63:0] and 1558 /// bits [191:128] in the destination, and the selected elements from the 1559 /// second 256-bit operand are copied to bits [127:64] and bits [255:192] in 1560 /// the destination. For example, if bits [3:0] of the immediate operand 1561 /// contain a value of 0xF, the 256-bit destination vector would contain the 1562 /// following values: b[3], a[3], b[1], a[1]. 1563 /// 1564 /// \headerfile <x86intrin.h> 1565 /// 1566 /// \code 1567 /// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask); 1568 /// \endcode 1569 /// 1570 /// This intrinsic corresponds to the \c VSHUFPD / SHUFPD instruction. 1571 /// 1572 /// \param a 1573 /// A 256-bit vector of [4 x double]. 1574 /// \param b 1575 /// A 256-bit vector of [4 x double]. 1576 /// \param mask 1577 /// An immediate value containing 8-bit values specifying which elements to 1578 /// copy from a and b: 1579 /// Bit [0]=0: Bits [63:0] are copied from a to bits [63:0] of the 1580 /// destination. 1581 /// Bit [0]=1: Bits [127:64] are copied from a to bits [63:0] of the 1582 /// destination. 1583 /// Bit [1]=0: Bits [63:0] are copied from b to bits [127:64] of the 1584 /// destination. 1585 /// Bit [1]=1: Bits [127:64] are copied from b to bits [127:64] of the 1586 /// destination. 
1587 /// Bit [2]=0: Bits [191:128] are copied from a to bits [191:128] of the 1588 /// destination. 1589 /// Bit [2]=1: Bits [255:192] are copied from a to bits [191:128] of the 1590 /// destination. 1591 /// Bit [3]=0: Bits [191:128] are copied from b to bits [255:192] of the 1592 /// destination. 1593 /// Bit [3]=1: Bits [255:192] are copied from b to bits [255:192] of the 1594 /// destination. 1595 /// \returns A 256-bit vector of [4 x double] containing the shuffled values. 1596 #define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \ 1597 (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \ 1598 (__v4df)(__m256d)(b), \ 1599 0 + (((mask) >> 0) & 0x1), \ 1600 4 + (((mask) >> 1) & 0x1), \ 1601 2 + (((mask) >> 2) & 0x1), \ 1602 6 + (((mask) >> 3) & 0x1)); }) 1603 1604 /* Compare */ 1605 #define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ 1606 #define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ 1607 #define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */ 1608 #define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */ 1609 #define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */ 1610 #define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ 1611 #define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ 1612 #define _CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */ 1613 #define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */ 1614 #define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */ 1615 #define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */ 1616 #define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */ 1617 #define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */ 1618 #define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */ 1619 #define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */ 1620 #define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */ 1621 #define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */ 
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

/// \brief Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand. Returns a [2 x double] vector consisting of
///    two doubles corresponding to the two comparison results: zero if the
///    comparison is false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use:
///    00h, 08h, 10h, 18h: Equal
///    01h, 09h, 11h, 19h: Less than
///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
///                        operands)
///    03h, 0Bh, 13h, 1Bh: Unordered
///    04h, 0Ch, 14h, 1Ch: Not equal
///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
///                        (swapped operands)
///    07h, 0Fh, 17h, 1Fh: Ordered
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                (__v2df)(__m128d)(b), (c)); })

/// \brief Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand. Returns a [4 x float] vector consisting of four floats
///    corresponding to the four comparison results: zero if the comparison is
///    false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use:
///    00h, 08h, 10h, 18h: Equal
///    01h, 09h, 11h, 19h: Less than
///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
///                        operands)
///    03h, 0Bh, 13h, 1Bh: Unordered
///    04h, 0Ch, 14h, 1Ch: Not equal
///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
///                        (swapped operands)
///    07h, 0Fh, 17h, 1Fh: Ordered
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                               (__v4sf)(__m128)(b), (c)); })

/// \brief Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand. Returns a [4 x double] vector consisting of
///    four doubles corresponding to the four comparison results: zero if the
///    comparison is false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use:
///    00h, 08h, 10h, 18h: Equal
///    01h, 09h, 11h, 19h: Less than
///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
///                        operands)
///    03h, 0Bh, 13h, 1Bh: Unordered
///    04h, 0Ch, 14h, 1Ch: Not equal
///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
///                        (swapped operands)
///    07h, 0Fh, 17h, 1Fh: Ordered
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                   (__v4df)(__m256d)(b), (c)); })

/// \brief Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand. Returns a [8 x float] vector consisting of eight floats
///    corresponding to the eight comparison results: zero if the comparison is
///    false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use:
///    00h, 08h, 10h, 18h: Equal
///    01h, 09h, 11h, 19h: Less than
///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
///                        operands)
///    03h, 0Bh, 13h, 1Bh: Unordered
///    04h, 0Ch, 14h, 1Ch: Not equal
///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
///                        (swapped operands)
///    07h, 0Fh, 17h, 1Fh: Ordered
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                  (__v8sf)(__m256)(b), (c)); })

/// \brief Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand. If the result is true, all 64 bits of the
///    destination vector are set; otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPSD / CMPSD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use:
///    00h, 08h, 10h, 18h: Equal
///    01h, 09h, 11h, 19h: Less than
///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
///                        operands)
///    03h, 0Bh, 13h, 1Bh: Unordered
///    04h, 0Ch, 14h, 1Ch: Not equal
///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
///                        (swapped operands)
///    07h, 0Fh, 17h, 1Fh: Ordered
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                (__v2df)(__m128d)(b), (c)); })

/// \brief Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand. If the result is true, all 32 bits of the destination
///    vector are set; otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the \c VCMPSS / CMPSS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use:
///    00h, 08h, 10h, 18h: Equal
///    01h, 09h, 11h, 19h: Less than
///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
///                        operands)
///    03h, 0Bh, 13h, 1Bh: Unordered
///    04h, 0Ch, 14h, 1Ch: Not equal
///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
///                        (swapped operands)
///    07h, 0Fh, 17h, 1Fh: Ordered
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                               (__v4sf)(__m128)(b), (c)); })

/// \brief Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
///    EXTRACTF128+COMPOSITE instruction.
///
/// \param __a
///    A 256-bit vector of [8 x i32].
/// \param __imm
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
static __inline int __DEFAULT_FN_ATTRS
_mm256_extract_epi32(__m256i __a, const int __imm)
{
  __v8si __b = (__v8si)__a;
  /* The index is masked so out-of-range immediates wrap instead of reading
   * outside the vector. */
  return __b[__imm & 7];
}

/// \brief Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
///    EXTRACTF128+COMPOSITE instruction.
///
/// \param __a
///    A 256-bit integer vector of [16 x i16].
/// \param __imm
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
static __inline int __DEFAULT_FN_ATTRS
_mm256_extract_epi16(__m256i __a, const int __imm)
{
  __v16hi __b = (__v16hi)__a;
  /* Cast through unsigned short so the element is zero-extended to int. */
  return (unsigned short)__b[__imm & 15];
}

/// \brief Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
///    EXTRACTF128+COMPOSITE instruction.
///
/// \param __a
///    A 256-bit integer vector of [32 x i8].
/// \param __imm
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
static __inline int __DEFAULT_FN_ATTRS
_mm256_extract_epi8(__m256i __a, const int __imm)
{
  __v32qi __b = (__v32qi)__a;
  /* Cast through unsigned char so the element is zero-extended to int. */
  return (unsigned char)__b[__imm & 31];
}

#ifdef __x86_64__
/// \brief Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
///    EXTRACTF128+COMPOSITE instruction.
///
/// \param __a
///    A 256-bit integer vector of [4 x i64].
/// \param __imm
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
static __inline long long __DEFAULT_FN_ATTRS
_mm256_extract_epi64(__m256i __a, const int __imm)
{
  __v4di __b = (__v4di)__a;
  return __b[__imm & 3];
}
#endif

/// \brief Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand by a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
///    INSERTF128+COMPOSITE instruction.
///
/// \param __a
///    A vector of [8 x i32] to be used by the insert operation.
/// \param __b
///    An integer value. The replacement value for the insert operation.
/// \param __imm
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector __a, after replacing its element indexed by __imm
///    with __b.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
{
  __v8si __c = (__v8si)__a;
  __c[__imm & 7] = __b;
  return (__m256i)__c;
}


/// \brief Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
///    INSERTF128+COMPOSITE instruction.
///
/// \param __a
///    A vector of [16 x i16] to be used by the insert operation.
/// \param __b
///    An i16 integer value. The replacement value for the insert operation.
/// \param __imm
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector __a, after replacing its element indexed by __imm
///    with __b.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
{
  __v16hi __c = (__v16hi)__a;
  __c[__imm & 15] = __b;
  return (__m256i)__c;
}

/// \brief Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
///    INSERTF128+COMPOSITE instruction.
///
/// \param __a
///    A vector of [32 x i8] to be used by the insert operation.
/// \param __b
///    An i8 integer value. The replacement value for the insert operation.
/// \param __imm
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector __a, after replacing its element indexed by __imm
///    with __b.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
{
  __v32qi __c = (__v32qi)__a;
  __c[__imm & 31] = __b;
  return (__m256i)__c;
}

#ifdef __x86_64__
/// \brief Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
///    INSERTF128+COMPOSITE instruction.
///
/// \param __a
///    A vector of [4 x i64] to be used by the insert operation.
/// \param __b
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param __imm
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector __a, after replacing its element indexed by __imm
///    with __b.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
{
  __v4di __c = (__v4di)__a;
  __c[__imm & 3] = __b;
  return (__m256i)__c;
}
#endif

/* Conversion */
/// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VCVTDQ2PD / CVTDQ2PD instruction.
///
/// \param __a
///    A 128-bit integer vector of [4 x i32].
/// \returns A 256-bit vector of [4 x double] containing the converted values.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_cvtepi32_pd(__m128i __a)
{
  return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
}

/// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \returns A 256-bit vector of [8 x float] containing the converted values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_cvtepi32_ps(__m256i __a)
{
  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
}

/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VCVTPD2PS / CVTPD2PS instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double].
/// \returns A 128-bit vector of [4 x float] containing the converted values.
static __inline __m128 __DEFAULT_FN_ATTRS
_mm256_cvtpd_ps(__m256d __a)
{
  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
}

/// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit integer vector containing the converted values.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_cvtps_epi32(__m256 __a)
{
  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
}

/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of
///    [4 x double].
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_cvtps_pd(__m128 __a)
{
  return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
}

/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
///    [4 x i32], truncating (rounding toward zero) per C conversion rules.
static __inline __m128i __DEFAULT_FN_ATTRS
_mm256_cvttpd_epi32(__m256d __a)
{
  return (__m128i)__builtin_convertvector((__v4df) __a, __v4si);
}

/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
///    [4 x i32]. This intrinsic corresponds to the \c VCVTPD2DQ instruction.
static __inline __m128i __DEFAULT_FN_ATTRS
_mm256_cvtpd_epi32(__m256d __a)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
}

/// \brief Converts a 256-bit vector of [8 x float] into a 256-bit vector of
///    [8 x i32], truncating (rounding toward zero) per C conversion rules.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_cvttps_epi32(__m256 __a)
{
  return (__m256i)__builtin_convertvector((__v8sf) __a, __v8si);
}

/// \brief Returns the double-precision value in bits [63:0] of \a __a.
static __inline double __DEFAULT_FN_ATTRS
_mm256_cvtsd_f64(__m256d __a)
{
  return __a[0];
}

/// \brief Returns the 32-bit integer in bits [31:0] of \a __a.
static __inline int __DEFAULT_FN_ATTRS
_mm256_cvtsi256_si32(__m256i __a)
{
  __v8si __b = (__v8si)__a;
  return __b[0];
}

/// \brief Returns the single-precision value in bits [31:0] of \a __a.
static __inline float __DEFAULT_FN_ATTRS
_mm256_cvtss_f32(__m256 __a)
{
  return __a[0];
}

/* Vector replicate */
/// \brief Duplicates the odd-indexed elements of \a __a:
///    result = { a1, a1, a3, a3, a5, a5, a7, a7 }.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_movehdup_ps(__m256 __a)
{
  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
}

/// \brief Duplicates the even-indexed elements of \a __a:
///    result = { a0, a0, a2, a2, a4, a4, a6, a6 }.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_moveldup_ps(__m256 __a)
{
  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
}

/// \brief Duplicates the even-indexed double of each 128-bit lane of \a __a:
///    result = { a0, a0, a2, a2 }.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_movedup_pd(__m256d __a)
{
  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
}

/* Unpack and Interleave */
/// \brief Interleaves the odd-indexed doubles of \a __a and \a __b within each
///    128-bit lane: result = { a1, b1, a3, b3 }.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_unpackhi_pd(__m256d __a, __m256d __b)
{
  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
}

/// \brief Interleaves the even-indexed doubles of \a __a and \a __b within each
///    128-bit lane: result = { a0, b0, a2, b2 }.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_unpacklo_pd(__m256d __a, __m256d __b)
{
  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
}

/// \brief Interleaves the upper two floats of each 128-bit lane of \a __a and
///    \a __b: result = { a2, b2, a3, b3, a6, b6, a7, b7 }.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_unpackhi_ps(__m256 __a, __m256 __b)
{
  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
}

/// \brief Interleaves the lower two floats of each 128-bit lane of \a __a and
///    \a __b: result = { a0, b0, a1, b1, a4, b4, a5, b5 }.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_unpacklo_ps(__m256 __a, __m256 __b)
{
  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
}

/* Bit Test */
/// \brief Returns the ZF flag produced by \c VTESTPD on \a __a and \a __b.
static __inline int __DEFAULT_FN_ATTRS
_mm_testz_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
}

/// \brief Returns the CF flag produced by \c VTESTPD on \a __a and \a __b.
static __inline int __DEFAULT_FN_ATTRS
_mm_testc_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
}

/// \brief Returns 1 if \c VTESTPD on \a __a and \a __b clears both ZF and CF;
///    otherwise returns 0.
static __inline int __DEFAULT_FN_ATTRS
_mm_testnzc_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
}

/// \brief Returns the ZF flag produced by \c VTESTPS on \a __a and \a __b.
static __inline int __DEFAULT_FN_ATTRS
_mm_testz_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
}

/// \brief Returns the CF flag produced by \c VTESTPS on \a __a and \a __b.
static __inline int __DEFAULT_FN_ATTRS
_mm_testc_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
}

/// \brief Returns 1 if \c VTESTPS on \a __a and \a __b clears both ZF and CF;
///    otherwise returns 0.
static __inline int __DEFAULT_FN_ATTRS
_mm_testnzc_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
}

/// \brief 256-bit form of _mm_testz_pd.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testz_pd(__m256d __a, __m256d __b)
{
  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
}

/// \brief 256-bit form of _mm_testc_pd.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testc_pd(__m256d __a, __m256d __b)
{
  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
}

/// \brief 256-bit form of _mm_testnzc_pd.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testnzc_pd(__m256d __a, __m256d __b)
{
  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
}

/// \brief 256-bit form of _mm_testz_ps.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testz_ps(__m256 __a, __m256 __b)
{
  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
}

/// \brief 256-bit form of _mm_testc_ps.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testc_ps(__m256 __a, __m256 __b)
{
  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
}

/// \brief 256-bit form of _mm_testnzc_ps.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testnzc_ps(__m256 __a, __m256 __b)
{
  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
}

/// \brief Returns the ZF flag produced by \c VPTEST on \a __a and \a __b.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testz_si256(__m256i __a, __m256i __b)
{
  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
}

/// \brief Returns the CF flag produced by \c VPTEST on \a __a and \a __b.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testc_si256(__m256i __a, __m256i __b)
{
  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
}

/// \brief Returns 1 if \c VPTEST on \a __a and \a __b clears both ZF and CF;
///    otherwise returns 0.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testnzc_si256(__m256i __a, __m256i __b)
{
  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
}

/* Vector extract sign mask */
/// \brief Packs the sign bits of the four doubles in \a __a into bits [3:0]
///    of the result.
static __inline int __DEFAULT_FN_ATTRS
_mm256_movemask_pd(__m256d __a)
{
  return __builtin_ia32_movmskpd256((__v4df)__a);
}

/// \brief Packs the sign bits of the eight floats in \a __a into bits [7:0]
///    of the result.
static __inline int __DEFAULT_FN_ATTRS
_mm256_movemask_ps(__m256 __a)
{
  return __builtin_ia32_movmskps256((__v8sf)__a);
}

/* Vector zero */
/// \brief Zeroes the contents of all YMM registers (VZEROALL).
static __inline void __DEFAULT_FN_ATTRS
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

/// \brief Zeroes the upper 128 bits of all YMM registers (VZEROUPPER).
static __inline void __DEFAULT_FN_ATTRS
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}

/* Vector load with broadcast */
/// \brief Loads a single float from memory and broadcasts it to all four
///    elements of the result.
static __inline __m128 __DEFAULT_FN_ATTRS
_mm_broadcast_ss(float const *__a)
{
  float __f = *__a;
  return (__m128)(__v4sf){ __f, __f, __f, __f };
}

/// \brief Loads a single double from memory and broadcasts it to all four
///    elements of the result.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_broadcast_sd(double const *__a)
{
  double __d = *__a;
  return (__m256d)(__v4df){ __d, __d, __d, __d };
}

/// \brief Loads a single float from memory and broadcasts it to all eight
///    elements of the result.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_broadcast_ss(float const *__a)
{
  float __f = *__a;
  return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
}

/// \brief Loads a 128-bit vector of [2 x double] from memory and duplicates it
///    into both 128-bit lanes of the result (VBROADCASTF128).
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_broadcast_pd(__m128d const *__a)
{
  return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
}

/// \brief Loads a 128-bit vector of [4 x float] from memory and duplicates it
///    into both 128-bit lanes of the result (VBROADCASTF128).
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_broadcast_ps(__m128 const *__a)
{
  return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
}

/* SIMD load ops */
/// \brief Loads a 256-bit vector of [4 x double]; \a __p must be 32-byte
///    aligned (the plain __m256d dereference assumes full alignment).
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_load_pd(double const *__p)
{
  return *(__m256d *)__p;
}

/// \brief Loads a 256-bit vector of [8 x float]; \a __p must be 32-byte
///    aligned.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_load_ps(float const *__p)
{
  return *(__m256 *)__p;
}

/// \brief Loads a 256-bit vector of [4 x double] from an unaligned address.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_loadu_pd(double const *__p)
{
  /* The packed struct drops the alignment requirement; may_alias makes the
   * access legal regardless of the pointee's declared type. */
  struct __loadu_pd {
    __m256d __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_pd*)__p)->__v;
}

/// \brief Loads a 256-bit vector of [8 x float] from an unaligned address.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_loadu_ps(float const *__p)
{
  /* Packed struct removes the 32-byte alignment assumption. */
  struct __loadu_ps {
    __m256 __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_ps*)__p)->__v;
}

/// \brief Loads a 256-bit integer vector; \a __p must be 32-byte aligned.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_load_si256(__m256i const *__p)
{
  return *__p;
}

/// \brief Loads a 256-bit integer vector from an unaligned address.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_loadu_si256(__m256i const *__p)
{
  /* Packed struct removes the 32-byte alignment assumption. */
  struct __loadu_si256 {
    __m256i __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_si256*)__p)->__v;
}

/// \brief Loads a 256-bit integer vector from an unaligned address using the
///    \c VLDDQU instruction.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_lddqu_si256(__m256i const *__p)
{
  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
}

/* SIMD store ops */
/// \brief Stores a 256-bit vector of [4 x double]; \a __p must be 32-byte
///    aligned.
static __inline void __DEFAULT_FN_ATTRS
_mm256_store_pd(double *__p, __m256d __a)
{
  *(__m256d *)__p = __a;
}

/// \brief Stores a 256-bit vector of [8 x float]; \a __p must be 32-byte
///    aligned.
static __inline void __DEFAULT_FN_ATTRS
_mm256_store_ps(float *__p, __m256 __a)
{
  *(__m256 *)__p = __a;
}

/// \brief Stores a 256-bit vector of [4 x double] to an unaligned address.
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu_pd(double *__p, __m256d __a)
{
  /* Packed struct removes the 32-byte alignment assumption. */
  struct __storeu_pd {
    __m256d __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_pd*)__p)->__v = __a;
}

/// \brief Stores a 256-bit vector of [8 x float] to an unaligned address.
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu_ps(float *__p, __m256 __a)
{
  /* Packed struct removes the 32-byte alignment assumption. */
  struct __storeu_ps {
    __m256 __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ps*)__p)->__v = __a;
}

/// \brief Stores a 256-bit integer vector; \a __p must be 32-byte aligned.
static __inline void __DEFAULT_FN_ATTRS
_mm256_store_si256(__m256i *__p, __m256i __a)
{
  *__p = __a;
}

/// \brief Stores a 256-bit integer vector to an unaligned address.
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu_si256(__m256i *__p, __m256i __a)
{
  /* Packed struct removes the 32-byte alignment assumption. */
  struct __storeu_si256 {
    __m256i __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_si256*)__p)->__v = __a;
}


/* Conditional load ops */
/// \brief Conditionally loads [2 x double] elements under control of the sign
///    bits of the corresponding mask elements (VMASKMOVPD).
static __inline __m128d __DEFAULT_FN_ATTRS
_mm_maskload_pd(double const *__p, __m128i __m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
}

/// \brief Conditionally loads [4 x double] elements under control of the sign
///    bits of the corresponding mask elements (VMASKMOVPD).
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_maskload_pd(double const *__p, __m256i __m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
                                               (__v4di)__m);
}

/// \brief Conditionally loads [4 x float] elements under control of the sign
///    bits of the corresponding mask elements (VMASKMOVPS).
static __inline __m128 __DEFAULT_FN_ATTRS
_mm_maskload_ps(float const *__p, __m128i __m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
}

/// \brief Conditionally loads [8 x float] elements under control of the sign
///    bits of the corresponding mask elements (VMASKMOVPS).
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_maskload_ps(float const *__p, __m256i __m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
}

/* Conditional store ops */
/// \brief Conditionally stores [8 x float] elements under control of the sign
///    bits of the corresponding mask elements (VMASKMOVPS).
static __inline void __DEFAULT_FN_ATTRS
_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
}

/// \brief Conditionally stores [2 x double] elements under control of the sign
///    bits of the corresponding mask elements (VMASKMOVPD).
static __inline void __DEFAULT_FN_ATTRS
_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
{
  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
}

/// \brief Conditionally stores [4 x double] elements under control of the sign
///    bits of the corresponding mask elements (VMASKMOVPD).
static __inline void __DEFAULT_FN_ATTRS
_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
{
  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
}

/// \brief Conditionally stores [4 x float] elements under control of the sign
///    bits of the corresponding mask elements (VMASKMOVPS).
static __inline void __DEFAULT_FN_ATTRS
_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
{
  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
}

/* Cacheability support ops */
/// \brief Non-temporal (streaming) store of a 256-bit integer vector;
///    \a __a should be 32-byte aligned.
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_si256(__m256i *__a, __m256i __b)
{
  __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
}

/// \brief Non-temporal (streaming) store of [4 x double]; \a __a should be
///    32-byte aligned.
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_pd(double *__a, __m256d __b)
{
  __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
}

/// \brief Non-temporal (streaming) store of [8 x float]; \a __p should be
///    32-byte aligned.
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_ps(float *__p, __m256 __a)
{
  __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
}

/* Create vectors */
/// \brief Returns a 256-bit vector of [4 x double] with undefined contents.
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_undefined_pd(void)
{
  return (__m256d)__builtin_ia32_undef256();
}

/// \brief Returns a 256-bit vector of [8 x float] with undefined contents.
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_undefined_ps(void)
{
  return (__m256)__builtin_ia32_undef256();
}

/// \brief Returns a 256-bit integer vector with undefined contents.
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_undefined_si256(void)
{
  return (__m256i)__builtin_ia32_undef256();
}

/// \brief Constructs a [4 x double] vector; arguments are given from the
///    highest element (\a __a) down to the lowest (\a __d).
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_set_pd(double __a, double __b, double __c, double __d)
{
  return (__m256d){ __d, __c, __b, __a };
}

/// \brief Constructs an [8 x float] vector; arguments are given from the
///    highest element (\a __a) down to the lowest (\a __h).
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_set_ps(float __a, float __b, float __c, float __d,
              float __e, float __f, float __g, float __h)
{
  return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
}

/// \brief Constructs an [8 x i32] vector; \a __i0 is the highest element and
///    \a __i7 the lowest.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
                 int __i4, int __i5, int __i6, int __i7)
{
  return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
}

/// \brief Constructs a [16 x i16] vector; \a __w15 is the highest element and
///    \a __w00 the lowest.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
                 short __w11, short __w10, short __w09, short __w08,
                 short __w07, short __w06, short __w05, short __w04,
                 short __w03, short __w02, short __w01, short __w00)
{
  return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
}

/// \brief Constructs a [32 x i8] vector; \a __b31 is the highest element and
///    \a __b00 the lowest.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
                char __b27, char __b26, char __b25, char __b24,
                char __b23, char __b22, char __b21, char __b20,
                char __b19, char __b18, char __b17, char __b16,
                char __b15, char __b14, char __b13, char __b12,
                char __b11, char __b10, char __b09, char __b08,
                char __b07, char __b06, char __b05, char __b04,
                char __b03, char __b02, char __b01, char __b00)
{
  return (__m256i)(__v32qi){
    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
  };
}

/// \brief Constructs a [4 x i64] vector; \a __a is the highest element and
///    \a __d the lowest.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
{
  return (__m256i)(__v4di){ __d, __c, __b, __a };
}

/* Create vectors with elements in reverse order */
/// \brief Constructs a [4 x double] vector; arguments are given from the
///    lowest element (\a __a) up to the highest (\a __d).
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_setr_pd(double __a, double __b, double __c, double __d)
{
  return (__m256d){ __a, __b, __c, __d };
}

/// \brief Constructs an [8 x float] vector; arguments are given from the
///    lowest element (\a __a) up to the highest (\a __h).
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_setr_ps(float __a, float __b, float __c, float __d,
               float __e, float __f, float __g, float __h)
{
  return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
}

/// \brief Constructs an [8 x i32] vector; \a __i0 is the lowest element and
///    \a __i7 the highest.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
                  int __i4, int __i5, int __i6, int __i7)
{
  return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
}

static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
                  short __w11, short __w10, short __w09, short __w08,
                  short __w07, short __w06, short __w05, short __w04,
                  short __w03, short __w02, short __w01, short __w00)
{
  return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
    __w08, __w07, __w06,
__w05, __w04, __w03, __w02, __w01, __w00 }; 2619 } 2620 2621 static __inline __m256i __DEFAULT_FN_ATTRS 2622 _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, 2623 char __b27, char __b26, char __b25, char __b24, 2624 char __b23, char __b22, char __b21, char __b20, 2625 char __b19, char __b18, char __b17, char __b16, 2626 char __b15, char __b14, char __b13, char __b12, 2627 char __b11, char __b10, char __b09, char __b08, 2628 char __b07, char __b06, char __b05, char __b04, 2629 char __b03, char __b02, char __b01, char __b00) 2630 { 2631 return (__m256i)(__v32qi){ 2632 __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24, 2633 __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16, 2634 __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08, 2635 __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 }; 2636 } 2637 2638 static __inline __m256i __DEFAULT_FN_ATTRS 2639 _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) 2640 { 2641 return (__m256i)(__v4di){ __a, __b, __c, __d }; 2642 } 2643 2644 /* Create vectors with repeated elements */ 2645 static __inline __m256d __DEFAULT_FN_ATTRS 2646 _mm256_set1_pd(double __w) 2647 { 2648 return (__m256d){ __w, __w, __w, __w }; 2649 } 2650 2651 static __inline __m256 __DEFAULT_FN_ATTRS 2652 _mm256_set1_ps(float __w) 2653 { 2654 return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w }; 2655 } 2656 2657 static __inline __m256i __DEFAULT_FN_ATTRS 2658 _mm256_set1_epi32(int __i) 2659 { 2660 return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i }; 2661 } 2662 2663 static __inline __m256i __DEFAULT_FN_ATTRS 2664 _mm256_set1_epi16(short __w) 2665 { 2666 return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w, 2667 __w, __w, __w, __w, __w, __w }; 2668 } 2669 2670 static __inline __m256i __DEFAULT_FN_ATTRS 2671 _mm256_set1_epi8(char __b) 2672 { 2673 return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, 2674 __b, __b, __b, __b, __b, __b, 
__b, __b, __b, __b, __b, __b, __b, __b, __b, 2675 __b, __b, __b, __b, __b, __b, __b }; 2676 } 2677 2678 static __inline __m256i __DEFAULT_FN_ATTRS 2679 _mm256_set1_epi64x(long long __q) 2680 { 2681 return (__m256i)(__v4di){ __q, __q, __q, __q }; 2682 } 2683 2684 /* Create __zeroed vectors */ 2685 static __inline __m256d __DEFAULT_FN_ATTRS 2686 _mm256_setzero_pd(void) 2687 { 2688 return (__m256d){ 0, 0, 0, 0 }; 2689 } 2690 2691 static __inline __m256 __DEFAULT_FN_ATTRS 2692 _mm256_setzero_ps(void) 2693 { 2694 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 }; 2695 } 2696 2697 static __inline __m256i __DEFAULT_FN_ATTRS 2698 _mm256_setzero_si256(void) 2699 { 2700 return (__m256i){ 0LL, 0LL, 0LL, 0LL }; 2701 } 2702 2703 /* Cast between vector types */ 2704 static __inline __m256 __DEFAULT_FN_ATTRS 2705 _mm256_castpd_ps(__m256d __a) 2706 { 2707 return (__m256)__a; 2708 } 2709 2710 static __inline __m256i __DEFAULT_FN_ATTRS 2711 _mm256_castpd_si256(__m256d __a) 2712 { 2713 return (__m256i)__a; 2714 } 2715 2716 static __inline __m256d __DEFAULT_FN_ATTRS 2717 _mm256_castps_pd(__m256 __a) 2718 { 2719 return (__m256d)__a; 2720 } 2721 2722 static __inline __m256i __DEFAULT_FN_ATTRS 2723 _mm256_castps_si256(__m256 __a) 2724 { 2725 return (__m256i)__a; 2726 } 2727 2728 static __inline __m256 __DEFAULT_FN_ATTRS 2729 _mm256_castsi256_ps(__m256i __a) 2730 { 2731 return (__m256)__a; 2732 } 2733 2734 static __inline __m256d __DEFAULT_FN_ATTRS 2735 _mm256_castsi256_pd(__m256i __a) 2736 { 2737 return (__m256d)__a; 2738 } 2739 2740 static __inline __m128d __DEFAULT_FN_ATTRS 2741 _mm256_castpd256_pd128(__m256d __a) 2742 { 2743 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1); 2744 } 2745 2746 static __inline __m128 __DEFAULT_FN_ATTRS 2747 _mm256_castps256_ps128(__m256 __a) 2748 { 2749 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3); 2750 } 2751 2752 static __inline __m128i __DEFAULT_FN_ATTRS 2753 _mm256_castsi256_si128(__m256i __a) 2754 { 2755 return 
__builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1); 2756 } 2757 2758 static __inline __m256d __DEFAULT_FN_ATTRS 2759 _mm256_castpd128_pd256(__m128d __a) 2760 { 2761 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1); 2762 } 2763 2764 static __inline __m256 __DEFAULT_FN_ATTRS 2765 _mm256_castps128_ps256(__m128 __a) 2766 { 2767 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1); 2768 } 2769 2770 static __inline __m256i __DEFAULT_FN_ATTRS 2771 _mm256_castsi128_si256(__m128i __a) 2772 { 2773 return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1); 2774 } 2775 2776 /* 2777 Vector insert. 2778 We use macros rather than inlines because we only want to accept 2779 invocations where the immediate M is a constant expression. 2780 */ 2781 #define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \ 2782 (__m256)__builtin_shufflevector( \ 2783 (__v8sf)(__m256)(V1), \ 2784 (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \ 2785 (((M) & 1) ? 0 : 8), \ 2786 (((M) & 1) ? 1 : 9), \ 2787 (((M) & 1) ? 2 : 10), \ 2788 (((M) & 1) ? 3 : 11), \ 2789 (((M) & 1) ? 8 : 4), \ 2790 (((M) & 1) ? 9 : 5), \ 2791 (((M) & 1) ? 10 : 6), \ 2792 (((M) & 1) ? 11 : 7) );}) 2793 2794 #define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \ 2795 (__m256d)__builtin_shufflevector( \ 2796 (__v4df)(__m256d)(V1), \ 2797 (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \ 2798 (((M) & 1) ? 0 : 4), \ 2799 (((M) & 1) ? 1 : 5), \ 2800 (((M) & 1) ? 4 : 2), \ 2801 (((M) & 1) ? 5 : 3) );}) 2802 2803 #define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \ 2804 (__m256i)__builtin_shufflevector( \ 2805 (__v4di)(__m256i)(V1), \ 2806 (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \ 2807 (((M) & 1) ? 0 : 4), \ 2808 (((M) & 1) ? 1 : 5), \ 2809 (((M) & 1) ? 4 : 2), \ 2810 (((M) & 1) ? 5 : 3) );}) 2811 2812 /* 2813 Vector extract. 
2814 We use macros rather than inlines because we only want to accept 2815 invocations where the immediate M is a constant expression. 2816 */ 2817 #define _mm256_extractf128_ps(V, M) __extension__ ({ \ 2818 (__m128)__builtin_shufflevector( \ 2819 (__v8sf)(__m256)(V), \ 2820 (__v8sf)(_mm256_undefined_ps()), \ 2821 (((M) & 1) ? 4 : 0), \ 2822 (((M) & 1) ? 5 : 1), \ 2823 (((M) & 1) ? 6 : 2), \ 2824 (((M) & 1) ? 7 : 3) );}) 2825 2826 #define _mm256_extractf128_pd(V, M) __extension__ ({ \ 2827 (__m128d)__builtin_shufflevector( \ 2828 (__v4df)(__m256d)(V), \ 2829 (__v4df)(_mm256_undefined_pd()), \ 2830 (((M) & 1) ? 2 : 0), \ 2831 (((M) & 1) ? 3 : 1) );}) 2832 2833 #define _mm256_extractf128_si256(V, M) __extension__ ({ \ 2834 (__m128i)__builtin_shufflevector( \ 2835 (__v4di)(__m256i)(V), \ 2836 (__v4di)(_mm256_undefined_si256()), \ 2837 (((M) & 1) ? 2 : 0), \ 2838 (((M) & 1) ? 3 : 1) );}) 2839 2840 /* SIMD load ops (unaligned) */ 2841 static __inline __m256 __DEFAULT_FN_ATTRS 2842 _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo) 2843 { 2844 __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo)); 2845 return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1); 2846 } 2847 2848 static __inline __m256d __DEFAULT_FN_ATTRS 2849 _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo) 2850 { 2851 __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo)); 2852 return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1); 2853 } 2854 2855 static __inline __m256i __DEFAULT_FN_ATTRS 2856 _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo) 2857 { 2858 __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo)); 2859 return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1); 2860 } 2861 2862 /* SIMD store ops (unaligned) */ 2863 static __inline void __DEFAULT_FN_ATTRS 2864 _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a) 2865 { 2866 __m128 __v128; 2867 2868 __v128 = 
_mm256_castps256_ps128(__a); 2869 _mm_storeu_ps(__addr_lo, __v128); 2870 __v128 = _mm256_extractf128_ps(__a, 1); 2871 _mm_storeu_ps(__addr_hi, __v128); 2872 } 2873 2874 static __inline void __DEFAULT_FN_ATTRS 2875 _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a) 2876 { 2877 __m128d __v128; 2878 2879 __v128 = _mm256_castpd256_pd128(__a); 2880 _mm_storeu_pd(__addr_lo, __v128); 2881 __v128 = _mm256_extractf128_pd(__a, 1); 2882 _mm_storeu_pd(__addr_hi, __v128); 2883 } 2884 2885 static __inline void __DEFAULT_FN_ATTRS 2886 _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a) 2887 { 2888 __m128i __v128; 2889 2890 __v128 = _mm256_castsi256_si128(__a); 2891 _mm_storeu_si128(__addr_lo, __v128); 2892 __v128 = _mm256_extractf128_si256(__a, 1); 2893 _mm_storeu_si128(__addr_hi, __v128); 2894 } 2895 2896 static __inline __m256 __DEFAULT_FN_ATTRS 2897 _mm256_set_m128 (__m128 __hi, __m128 __lo) { 2898 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7); 2899 } 2900 2901 static __inline __m256d __DEFAULT_FN_ATTRS 2902 _mm256_set_m128d (__m128d __hi, __m128d __lo) { 2903 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); 2904 } 2905 2906 static __inline __m256i __DEFAULT_FN_ATTRS 2907 _mm256_set_m128i (__m128i __hi, __m128i __lo) { 2908 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); 2909 } 2910 2911 static __inline __m256 __DEFAULT_FN_ATTRS 2912 _mm256_setr_m128 (__m128 __lo, __m128 __hi) { 2913 return _mm256_set_m128(__hi, __lo); 2914 } 2915 2916 static __inline __m256d __DEFAULT_FN_ATTRS 2917 _mm256_setr_m128d (__m128d __lo, __m128d __hi) { 2918 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); 2919 } 2920 2921 static __inline __m256i __DEFAULT_FN_ATTRS 2922 _mm256_setr_m128i (__m128i __lo, __m128i __hi) { 2923 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); 2924 } 2925 2926 #undef __DEFAULT_FN_ATTRS 2927 2928 #endif /* __AVXINTRIN_H */ 2929