/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

/* This header is only valid when pulled in through <immintrin.h>, which is
 * expected to define __IMMINTRIN_H before including it. */
#ifndef __IMMINTRIN_H
#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVXINTRIN_H
#define __AVXINTRIN_H

/* 32-byte (256-bit) vector types, one per element type. They are used by the
 * intrinsics in this file for explicit element-typed casts. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though.
*/ 40 typedef signed char __v32qs __attribute__((__vector_size__(32))); 41 42 typedef float __m256 __attribute__ ((__vector_size__ (32))); 43 typedef double __m256d __attribute__((__vector_size__(32))); 44 typedef long long __m256i __attribute__((__vector_size__(32))); 45 46 /* Define the default attributes for the functions in this file. */ 47 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"))) 48 49 /* Arithmetic */ 50 /// \brief Adds two 256-bit vectors of [4 x double]. 51 /// 52 /// \headerfile <x86intrin.h> 53 /// 54 /// This intrinsic corresponds to the \c VADDPD / ADDPD instruction. 55 /// 56 /// \param __a 57 /// A 256-bit vector of [4 x double] containing one of the source operands. 58 /// \param __b 59 /// A 256-bit vector of [4 x double] containing one of the source operands. 60 /// \returns A 256-bit vector of [4 x double] containing the sums of both 61 /// operands. 62 static __inline __m256d __DEFAULT_FN_ATTRS 63 _mm256_add_pd(__m256d __a, __m256d __b) 64 { 65 return __a+__b; 66 } 67 68 /// \brief Adds two 256-bit vectors of [8 x float]. 69 /// 70 /// \headerfile <x86intrin.h> 71 /// 72 /// This intrinsic corresponds to the \c VADDPS / ADDPS instruction. 73 /// 74 /// \param __a 75 /// A 256-bit vector of [8 x float] containing one of the source operands. 76 /// \param __b 77 /// A 256-bit vector of [8 x float] containing one of the source operands. 78 /// \returns A 256-bit vector of [8 x float] containing the sums of both 79 /// operands. 80 static __inline __m256 __DEFAULT_FN_ATTRS 81 _mm256_add_ps(__m256 __a, __m256 __b) 82 { 83 return __a+__b; 84 } 85 86 /// \brief Subtracts two 256-bit vectors of [4 x double]. 87 /// 88 /// \headerfile <x86intrin.h> 89 /// 90 /// This intrinsic corresponds to the \c VSUBPD / SUBPD instruction. 91 /// 92 /// \param __a 93 /// A 256-bit vector of [4 x double] containing the minuend. 94 /// \param __b 95 /// A 256-bit vector of [4 x double] containing the subtrahend. 
96 /// \returns A 256-bit vector of [4 x double] containing the differences between 97 /// both operands. 98 static __inline __m256d __DEFAULT_FN_ATTRS 99 _mm256_sub_pd(__m256d __a, __m256d __b) 100 { 101 return __a-__b; 102 } 103 104 /// \brief Subtracts two 256-bit vectors of [8 x float]. 105 /// 106 /// \headerfile <x86intrin.h> 107 /// 108 /// This intrinsic corresponds to the \c VSUBPS / SUBPS instruction. 109 /// 110 /// \param __a 111 /// A 256-bit vector of [8 x float] containing the minuend. 112 /// \param __b 113 /// A 256-bit vector of [8 x float] containing the subtrahend. 114 /// \returns A 256-bit vector of [8 x float] containing the differences between 115 /// both operands. 116 static __inline __m256 __DEFAULT_FN_ATTRS 117 _mm256_sub_ps(__m256 __a, __m256 __b) 118 { 119 return __a-__b; 120 } 121 122 /// \brief Adds the even-indexed values and subtracts the odd-indexed values of 123 /// two 256-bit vectors of [4 x double]. 124 /// 125 /// \headerfile <x86intrin.h> 126 /// 127 /// This intrinsic corresponds to the \c VADDSUBPD / ADDSUBPD instruction. 128 /// 129 /// \param __a 130 /// A 256-bit vector of [4 x double] containing the left source operand. 131 /// \param __b 132 /// A 256-bit vector of [4 x double] containing the right source operand. 133 /// \returns A 256-bit vector of [4 x double] containing the alternating sums 134 /// and differences between both operands. 135 static __inline __m256d __DEFAULT_FN_ATTRS 136 _mm256_addsub_pd(__m256d __a, __m256d __b) 137 { 138 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b); 139 } 140 141 /// \brief Adds the even-indexed values and subtracts the odd-indexed values of 142 /// two 256-bit vectors of [8 x float]. 143 /// 144 /// \headerfile <x86intrin.h> 145 /// 146 /// This intrinsic corresponds to the \c VADDSUBPS / ADDSUBPS instruction. 147 /// 148 /// \param __a 149 /// A 256-bit vector of [8 x float] containing the left source operand. 
150 /// \param __b 151 /// A 256-bit vector of [8 x float] containing the right source operand. 152 /// \returns A 256-bit vector of [8 x float] containing the alternating sums and 153 /// differences between both operands. 154 static __inline __m256 __DEFAULT_FN_ATTRS 155 _mm256_addsub_ps(__m256 __a, __m256 __b) 156 { 157 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b); 158 } 159 160 /// \brief Divides two 256-bit vectors of [4 x double]. 161 /// 162 /// \headerfile <x86intrin.h> 163 /// 164 /// This intrinsic corresponds to the \c VDIVPD / DIVPD instruction. 165 /// 166 /// \param __a 167 /// A 256-bit vector of [4 x double] containing the dividend. 168 /// \param __b 169 /// A 256-bit vector of [4 x double] containing the divisor. 170 /// \returns A 256-bit vector of [4 x double] containing the quotients between 171 /// both operands. 172 static __inline __m256d __DEFAULT_FN_ATTRS 173 _mm256_div_pd(__m256d __a, __m256d __b) 174 { 175 return __a / __b; 176 } 177 178 /// \brief Divides two 256-bit vectors of [8 x float]. 179 /// 180 /// \headerfile <x86intrin.h> 181 /// 182 /// This intrinsic corresponds to the \c VDIVPS / DIVPS instruction. 183 /// 184 /// \param __a 185 /// A 256-bit vector of [8 x float] containing the dividend. 186 /// \param __b 187 /// A 256-bit vector of [8 x float] containing the divisor. 188 /// \returns A 256-bit vector of [8 x float] containing the quotients between 189 /// both operands. 190 static __inline __m256 __DEFAULT_FN_ATTRS 191 _mm256_div_ps(__m256 __a, __m256 __b) 192 { 193 return __a / __b; 194 } 195 196 /// \brief Compares two 256-bit vectors of [4 x double] and returns the greater 197 /// of each pair of values. 198 /// 199 /// \headerfile <x86intrin.h> 200 /// 201 /// This intrinsic corresponds to the \c VMAXPD / MAXPD instruction. 202 /// 203 /// \param __a 204 /// A 256-bit vector of [4 x double] containing one of the operands. 
205 /// \param __b 206 /// A 256-bit vector of [4 x double] containing one of the operands. 207 /// \returns A 256-bit vector of [4 x double] containing the maximum values 208 /// between both operands. 209 static __inline __m256d __DEFAULT_FN_ATTRS 210 _mm256_max_pd(__m256d __a, __m256d __b) 211 { 212 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b); 213 } 214 215 /// \brief Compares two 256-bit vectors of [8 x float] and returns the greater 216 /// of each pair of values. 217 /// 218 /// \headerfile <x86intrin.h> 219 /// 220 /// This intrinsic corresponds to the \c VMAXPS / MAXPS instruction. 221 /// 222 /// \param __a 223 /// A 256-bit vector of [8 x float] containing one of the operands. 224 /// \param __b 225 /// A 256-bit vector of [8 x float] containing one of the operands. 226 /// \returns A 256-bit vector of [8 x float] containing the maximum values 227 /// between both operands. 228 static __inline __m256 __DEFAULT_FN_ATTRS 229 _mm256_max_ps(__m256 __a, __m256 __b) 230 { 231 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b); 232 } 233 234 /// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser 235 /// of each pair of values. 236 /// 237 /// \headerfile <x86intrin.h> 238 /// 239 /// This intrinsic corresponds to the \c VMINPD / MINPD instruction. 240 /// 241 /// \param __a 242 /// A 256-bit vector of [4 x double] containing one of the operands. 243 /// \param __b 244 /// A 256-bit vector of [4 x double] containing one of the operands. 245 /// \returns A 256-bit vector of [4 x double] containing the minimum values 246 /// between both operands. 247 static __inline __m256d __DEFAULT_FN_ATTRS 248 _mm256_min_pd(__m256d __a, __m256d __b) 249 { 250 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b); 251 } 252 253 /// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser 254 /// of each pair of values. 
255 /// 256 /// \headerfile <x86intrin.h> 257 /// 258 /// This intrinsic corresponds to the \c VMINPS / MINPS instruction. 259 /// 260 /// \param __a 261 /// A 256-bit vector of [8 x float] containing one of the operands. 262 /// \param __b 263 /// A 256-bit vector of [8 x float] containing one of the operands. 264 /// \returns A 256-bit vector of [8 x float] containing the minimum values 265 /// between both operands. 266 static __inline __m256 __DEFAULT_FN_ATTRS 267 _mm256_min_ps(__m256 __a, __m256 __b) 268 { 269 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b); 270 } 271 272 /// \brief Multiplies two 256-bit vectors of [4 x double]. 273 /// 274 /// \headerfile <x86intrin.h> 275 /// 276 /// This intrinsic corresponds to the \c VMULPD / MULPD instruction. 277 /// 278 /// \param __a 279 /// A 256-bit vector of [4 x double] containing one of the operands. 280 /// \param __b 281 /// A 256-bit vector of [4 x double] containing one of the operands. 282 /// \returns A 256-bit vector of [4 x double] containing the products between 283 /// both operands. 284 static __inline __m256d __DEFAULT_FN_ATTRS 285 _mm256_mul_pd(__m256d __a, __m256d __b) 286 { 287 return __a * __b; 288 } 289 290 /// \brief Multiplies two 256-bit vectors of [8 x float]. 291 /// 292 /// \headerfile <x86intrin.h> 293 /// 294 /// This intrinsic corresponds to the \c VMULPS / MULPS instruction. 295 /// 296 /// \param __a 297 /// A 256-bit vector of [8 x float] containing one of the operands. 298 /// \param __b 299 /// A 256-bit vector of [8 x float] containing one of the operands. 300 /// \returns A 256-bit vector of [8 x float] containing the products between 301 /// both operands. 302 static __inline __m256 __DEFAULT_FN_ATTRS 303 _mm256_mul_ps(__m256 __a, __m256 __b) 304 { 305 return __a * __b; 306 } 307 308 /// \brief Calculates the square roots of the values stored in a 256-bit vector 309 /// of [4 x double]. 
310 /// 311 /// \headerfile <x86intrin.h> 312 /// 313 /// This intrinsic corresponds to the \c VSQRTPD / SQRTPD instruction. 314 /// 315 /// \param __a 316 /// A 256-bit vector of [4 x double]. 317 /// \returns A 256-bit vector of [4 x double] containing the square roots of the 318 /// values in the operand. 319 static __inline __m256d __DEFAULT_FN_ATTRS 320 _mm256_sqrt_pd(__m256d __a) 321 { 322 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a); 323 } 324 325 /// \brief Calculates the square roots of the values stored in a 256-bit vector 326 /// of [8 x float]. 327 /// 328 /// \headerfile <x86intrin.h> 329 /// 330 /// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instruction. 331 /// 332 /// \param __a 333 /// A 256-bit vector of [8 x float]. 334 /// \returns A 256-bit vector of [8 x float] containing the square roots of the 335 /// values in the operand. 336 static __inline __m256 __DEFAULT_FN_ATTRS 337 _mm256_sqrt_ps(__m256 __a) 338 { 339 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a); 340 } 341 342 /// \brief Calculates the reciprocal square roots of the values stored in a 343 /// 256-bit vector of [8 x float]. 344 /// 345 /// \headerfile <x86intrin.h> 346 /// 347 /// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instruction. 348 /// 349 /// \param __a 350 /// A 256-bit vector of [8 x float]. 351 /// \returns A 256-bit vector of [8 x float] containing the reciprocal square 352 /// roots of the values in the operand. 353 static __inline __m256 __DEFAULT_FN_ATTRS 354 _mm256_rsqrt_ps(__m256 __a) 355 { 356 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a); 357 } 358 359 /// \brief Calculates the reciprocals of the values stored in a 256-bit vector 360 /// of [8 x float]. 361 /// 362 /// \headerfile <x86intrin.h> 363 /// 364 /// This intrinsic corresponds to the \c VRCPPS / RCPPS instruction. 365 /// 366 /// \param __a 367 /// A 256-bit vector of [8 x float]. 
368 /// \returns A 256-bit vector of [8 x float] containing the reciprocals of the 369 /// values in the operand. 370 static __inline __m256 __DEFAULT_FN_ATTRS 371 _mm256_rcp_ps(__m256 __a) 372 { 373 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a); 374 } 375 376 /// \brief Rounds the values stored in a 256-bit vector of [4 x double] as 377 /// specified by the byte operand. The source values are rounded to integer 378 /// values and returned as 64-bit double-precision floating-point values. 379 /// 380 /// \headerfile <x86intrin.h> 381 /// 382 /// \code 383 /// __m256d _mm256_round_pd(__m256d V, const int M); 384 /// \endcode 385 /// 386 /// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction. 387 /// 388 /// \param V 389 /// A 256-bit vector of [4 x double]. 390 /// \param M 391 /// An integer value that specifies the rounding operation. 392 /// Bits [7:4] are reserved. 393 /// Bit [3] is a precision exception value: 394 /// 0: A normal PE exception is used 395 /// 1: The PE field is not updated 396 /// Bit [2] is the rounding control source: 397 /// 0: Use bits [1:0] of M 398 /// 1: Use the current MXCSR setting 399 /// Bits [1:0] contain the rounding control definition: 400 /// 00: Nearest 401 /// 01: Downward (toward negative infinity) 402 /// 10: Upward (toward positive infinity) 403 /// 11: Truncated 404 /// \returns A 256-bit vector of [4 x double] containing the rounded values. 405 #define _mm256_round_pd(V, M) __extension__ ({ \ 406 (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); }) 407 408 /// \brief Rounds the values stored in a 256-bit vector of [8 x float] as 409 /// specified by the byte operand. The source values are rounded to integer 410 /// values and returned as floating-point values. 411 /// 412 /// \headerfile <x86intrin.h> 413 /// 414 /// \code 415 /// __m256 _mm256_round_ps(__m256 V, const int M); 416 /// \endcode 417 /// 418 /// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction. 
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation.
///    Bits [7:4] are reserved.
///    Bit [3] is a precision exception value:
///      0: A normal PE exception is used
///      1: The PE field is not updated
///    Bit [2] is the rounding control source:
///      0: Use bits [1:0] of M
///      1: Use the current MXCSR setting
///    Bits [1:0] contain the rounding control definition:
///      00: Nearest
///      01: Downward (toward negative infinity)
///      10: Upward (toward positive infinity)
///      11: Truncated
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M) __extension__ ({ \
  (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); })

/// \brief Round up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
/// This macro expands to _mm256_round_pd with \c _MM_FROUND_CEIL as the
/// rounding operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)

/// \brief Round down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)

/// \brief Round up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// This macro expands to _mm256_round_ps with \c _MM_FROUND_CEIL as the
/// rounding operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)

/// \brief Round down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
///
/// This macro expands to _mm256_round_ps with \c _MM_FROUND_FLOOR as the
/// rounding operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)

/* Logical */
/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VANDPD / ANDPD instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [4 x double] containing one of the source operands.
/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
///    values between both operands.
522 static __inline __m256d __DEFAULT_FN_ATTRS 523 _mm256_and_pd(__m256d __a, __m256d __b) 524 { 525 return (__m256d)((__v4di)__a & (__v4di)__b); 526 } 527 528 /// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float]. 529 /// 530 /// \headerfile <x86intrin.h> 531 /// 532 /// This intrinsic corresponds to the \c VANDPS / ANDPS instruction. 533 /// 534 /// \param __a 535 /// A 256-bit vector of [8 x float] containing one of the source operands. 536 /// \param __b 537 /// A 256-bit vector of [8 x float] containing one of the source operands. 538 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 539 /// values between both operands. 540 static __inline __m256 __DEFAULT_FN_ATTRS 541 _mm256_and_ps(__m256 __a, __m256 __b) 542 { 543 return (__m256)((__v8si)__a & (__v8si)__b); 544 } 545 546 /// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using 547 /// the one's complement of the values contained in the first source operand. 548 /// 549 /// \headerfile <x86intrin.h> 550 /// 551 /// This intrinsic corresponds to the \c VANDNPD / ANDNPD instruction. 552 /// 553 /// \param __a 554 /// A 256-bit vector of [4 x double] containing the left source operand. The 555 /// one's complement of this value is used in the bitwise AND. 556 /// \param __b 557 /// A 256-bit vector of [4 x double] containing the right source operand. 558 /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the 559 /// values of the second operand and the one's complement of the first 560 /// operand. 561 static __inline __m256d __DEFAULT_FN_ATTRS 562 _mm256_andnot_pd(__m256d __a, __m256d __b) 563 { 564 return (__m256d)(~(__v4di)__a & (__v4di)__b); 565 } 566 567 /// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using 568 /// the one's complement of the values contained in the first source operand. 
569 /// 570 /// \headerfile <x86intrin.h> 571 /// 572 /// This intrinsic corresponds to the \c VANDNPS / ANDNPS instruction. 573 /// 574 /// \param __a 575 /// A 256-bit vector of [8 x float] containing the left source operand. The 576 /// one's complement of this value is used in the bitwise AND. 577 /// \param __b 578 /// A 256-bit vector of [8 x float] containing the right source operand. 579 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 580 /// values of the second operand and the one's complement of the first 581 /// operand. 582 static __inline __m256 __DEFAULT_FN_ATTRS 583 _mm256_andnot_ps(__m256 __a, __m256 __b) 584 { 585 return (__m256)(~(__v8si)__a & (__v8si)__b); 586 } 587 588 /// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double]. 589 /// 590 /// \headerfile <x86intrin.h> 591 /// 592 /// This intrinsic corresponds to the \c VORPD / ORPD instruction. 593 /// 594 /// \param __a 595 /// A 256-bit vector of [4 x double] containing one of the source operands. 596 /// \param __b 597 /// A 256-bit vector of [4 x double] containing one of the source operands. 598 /// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the 599 /// values between both operands. 600 static __inline __m256d __DEFAULT_FN_ATTRS 601 _mm256_or_pd(__m256d __a, __m256d __b) 602 { 603 return (__m256d)((__v4di)__a | (__v4di)__b); 604 } 605 606 /// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float]. 607 /// 608 /// \headerfile <x86intrin.h> 609 /// 610 /// This intrinsic corresponds to the \c VORPS / ORPS instruction. 611 /// 612 /// \param __a 613 /// A 256-bit vector of [8 x float] containing one of the source operands. 614 /// \param __b 615 /// A 256-bit vector of [8 x float] containing one of the source operands. 616 /// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the 617 /// values between both operands. 
618 static __inline __m256 __DEFAULT_FN_ATTRS 619 _mm256_or_ps(__m256 __a, __m256 __b) 620 { 621 return (__m256)((__v8si)__a | (__v8si)__b); 622 } 623 624 /// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double]. 625 /// 626 /// \headerfile <x86intrin.h> 627 /// 628 /// This intrinsic corresponds to the \c VXORPD / XORPD instruction. 629 /// 630 /// \param __a 631 /// A 256-bit vector of [4 x double] containing one of the source operands. 632 /// \param __b 633 /// A 256-bit vector of [4 x double] containing one of the source operands. 634 /// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the 635 /// values between both operands. 636 static __inline __m256d __DEFAULT_FN_ATTRS 637 _mm256_xor_pd(__m256d __a, __m256d __b) 638 { 639 return (__m256d)((__v4di)__a ^ (__v4di)__b); 640 } 641 642 /// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float]. 643 /// 644 /// \headerfile <x86intrin.h> 645 /// 646 /// This intrinsic corresponds to the \c VXORPS / XORPS instruction. 647 /// 648 /// \param __a 649 /// A 256-bit vector of [8 x float] containing one of the source operands. 650 /// \param __b 651 /// A 256-bit vector of [8 x float] containing one of the source operands. 652 /// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the 653 /// values between both operands. 654 static __inline __m256 __DEFAULT_FN_ATTRS 655 _mm256_xor_ps(__m256 __a, __m256 __b) 656 { 657 return (__m256)((__v8si)__a ^ (__v8si)__b); 658 } 659 660 /* Horizontal arithmetic */ 661 /// \brief Horizontally adds the adjacent pairs of values contained in two 662 /// 256-bit vectors of [4 x double]. 663 /// 664 /// \headerfile <x86intrin.h> 665 /// 666 /// This intrinsic corresponds to the \c VHADDPD / HADDPD instruction. 667 /// 668 /// \param __a 669 /// A 256-bit vector of [4 x double] containing one of the source operands. 
670 /// The horizontal sums of the values are returned in the even-indexed 671 /// elements of a vector of [4 x double]. 672 /// \param __b 673 /// A 256-bit vector of [4 x double] containing one of the source operands. 674 /// The horizontal sums of the values are returned in the odd-indexed 675 /// elements of a vector of [4 x double]. 676 /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of 677 /// both operands. 678 static __inline __m256d __DEFAULT_FN_ATTRS 679 _mm256_hadd_pd(__m256d __a, __m256d __b) 680 { 681 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); 682 } 683 684 /// \brief Horizontally adds the adjacent pairs of values contained in two 685 /// 256-bit vectors of [8 x float]. 686 /// 687 /// \headerfile <x86intrin.h> 688 /// 689 /// This intrinsic corresponds to the \c VHADDPS / HADDPS instruction. 690 /// 691 /// \param __a 692 /// A 256-bit vector of [8 x float] containing one of the source operands. 693 /// The horizontal sums of the values are returned in the elements with 694 /// index 0, 1, 4, 5 of a vector of [8 x float]. 695 /// \param __b 696 /// A 256-bit vector of [8 x float] containing one of the source operands. 697 /// The horizontal sums of the values are returned in the elements with 698 /// index 2, 3, 6, 7 of a vector of [8 x float]. 699 /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of 700 /// both operands. 701 static __inline __m256 __DEFAULT_FN_ATTRS 702 _mm256_hadd_ps(__m256 __a, __m256 __b) 703 { 704 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); 705 } 706 707 /// \brief Horizontally subtracts the adjacent pairs of values contained in two 708 /// 256-bit vectors of [4 x double]. 709 /// 710 /// \headerfile <x86intrin.h> 711 /// 712 /// This intrinsic corresponds to the \c VHSUBPD / HSUBPD instruction. 713 /// 714 /// \param __a 715 /// A 256-bit vector of [4 x double] containing one of the source operands. 
716 /// The horizontal differences between the values are returned in the 717 /// even-indexed elements of a vector of [4 x double]. 718 /// \param __b 719 /// A 256-bit vector of [4 x double] containing one of the source operands. 720 /// The horizontal differences between the values are returned in the 721 /// odd-indexed elements of a vector of [4 x double]. 722 /// \returns A 256-bit vector of [4 x double] containing the horizontal 723 /// differences of both operands. 724 static __inline __m256d __DEFAULT_FN_ATTRS 725 _mm256_hsub_pd(__m256d __a, __m256d __b) 726 { 727 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); 728 } 729 730 /// \brief Horizontally subtracts the adjacent pairs of values contained in two 731 /// 256-bit vectors of [8 x float]. 732 /// 733 /// \headerfile <x86intrin.h> 734 /// 735 /// This intrinsic corresponds to the \c VHSUBPS / HSUBPS instruction. 736 /// 737 /// \param __a 738 /// A 256-bit vector of [8 x float] containing one of the source operands. 739 /// The horizontal differences between the values are returned in the 740 /// elements with index 0, 1, 4, 5 of a vector of [8 x float]. 741 /// \param __b 742 /// A 256-bit vector of [8 x float] containing one of the source operands. 743 /// The horizontal differences between the values are returned in the 744 /// elements with index 2, 3, 6, 7 of a vector of [8 x float]. 745 /// \returns A 256-bit vector of [8 x float] containing the horizontal 746 /// differences of both operands. 747 static __inline __m256 __DEFAULT_FN_ATTRS 748 _mm256_hsub_ps(__m256 __a, __m256 __b) 749 { 750 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b); 751 } 752 753 /* Vector permutations */ 754 /// \brief Copies the values stored in a 128-bit vector of [2 x double] as 755 /// specified by the 128-bit integer vector operand. 756 /// 757 /// \headerfile <x86intrin.h> 758 /// 759 /// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction. 
760 /// 761 /// \param __a 762 /// A 128-bit vector of [2 x double]. 763 /// \param __c 764 /// A 128-bit integer vector operand specifying how the values are to be 765 /// copied. 766 /// Bit [1]: 767 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the 768 /// returned vector 769 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the 770 /// returned vector 771 /// Bit [65]: 772 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the 773 /// returned vector 774 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the 775 /// returned vector 776 /// \returns A 128-bit vector of [2 x double] containing the copied values. 777 static __inline __m128d __DEFAULT_FN_ATTRS 778 _mm_permutevar_pd(__m128d __a, __m128i __c) 779 { 780 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c); 781 } 782 783 /// \brief Copies the values stored in a 256-bit vector of [4 x double] as 784 /// specified by the 256-bit integer vector operand. 785 /// 786 /// \headerfile <x86intrin.h> 787 /// 788 /// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction. 789 /// 790 /// \param __a 791 /// A 256-bit vector of [4 x double]. 792 /// \param __c 793 /// A 256-bit integer vector operand specifying how the values are to be 794 /// copied. 
795 /// Bit [1]: 796 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the 797 /// returned vector 798 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the 799 /// returned vector 800 /// Bit [65]: 801 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the 802 /// returned vector 803 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the 804 /// returned vector 805 /// Bit [129]: 806 /// 0: Bits [191:128] of the source are copied to bits [191:128] of the 807 /// returned vector 808 /// 1: Bits [255:192] of the source are copied to bits [191:128] of the 809 /// returned vector 810 /// Bit [193]: 811 /// 0: Bits [191:128] of the source are copied to bits [255:192] of the 812 /// returned vector 813 /// 1: Bits [255:192] of the source are copied to bits [255:192] of the 814 /// returned vector 815 /// \returns A 256-bit vector of [4 x double] containing the copied values. 816 static __inline __m256d __DEFAULT_FN_ATTRS 817 _mm256_permutevar_pd(__m256d __a, __m256i __c) 818 { 819 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c); 820 } 821 822 /// \brief Copies the values stored in a 128-bit vector of [4 x float] as 823 /// specified by the 128-bit integer vector operand. 824 /// 825 /// \headerfile <x86intrin.h> 826 /// 827 /// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction. 828 /// 829 /// \param __a 830 /// A 128-bit vector of [4 x float]. 831 /// \param __c 832 /// A 128-bit integer vector operand specifying how the values are to be 833 /// copied. 
834 /// Bits [1:0]: 835 /// 00: Bits [31:0] of the source are copied to bits [31:0] of the 836 /// returned vector 837 /// 01: Bits [63:32] of the source are copied to bits [31:0] of the 838 /// returned vector 839 /// 10: Bits [95:64] of the source are copied to bits [31:0] of the 840 /// returned vector 841 /// 11: Bits [127:96] of the source are copied to bits [31:0] of the 842 /// returned vector 843 /// Bits [33:32]: 844 /// 00: Bits [31:0] of the source are copied to bits [63:32] of the 845 /// returned vector 846 /// 01: Bits [63:32] of the source are copied to bits [63:32] of the 847 /// returned vector 848 /// 10: Bits [95:64] of the source are copied to bits [63:32] of the 849 /// returned vector 850 /// 11: Bits [127:96] of the source are copied to bits [63:32] of the 851 /// returned vector 852 /// Bits [65:64]: 853 /// 00: Bits [31:0] of the source are copied to bits [95:64] of the 854 /// returned vector 855 /// 01: Bits [63:32] of the source are copied to bits [95:64] of the 856 /// returned vector 857 /// 10: Bits [95:64] of the source are copied to bits [95:64] of the 858 /// returned vector 859 /// 11: Bits [127:96] of the source are copied to bits [95:64] of the 860 /// returned vector 861 /// Bits [97:96]: 862 /// 00: Bits [31:0] of the source are copied to bits [127:96] of the 863 /// returned vector 864 /// 01: Bits [63:32] of the source are copied to bits [127:96] of the 865 /// returned vector 866 /// 10: Bits [95:64] of the source are copied to bits [127:96] of the 867 /// returned vector 868 /// 11: Bits [127:96] of the source are copied to bits [127:96] of the 869 /// returned vector 870 /// \returns A 128-bit vector of [4 x float] containing the copied values. 
871 static __inline __m128 __DEFAULT_FN_ATTRS 872 _mm_permutevar_ps(__m128 __a, __m128i __c) 873 { 874 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c); 875 } 876 877 /// \brief Copies the values stored in a 256-bit vector of [8 x float] as 878 /// specified by the 256-bit integer vector operand. 879 /// 880 /// \headerfile <x86intrin.h> 881 /// 882 /// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction. 883 /// 884 /// \param __a 885 /// A 256-bit vector of [8 x float]. 886 /// \param __c 887 /// A 256-bit integer vector operand specifying how the values are to be 888 /// copied. 889 /// Bits [1:0]: 890 /// 00: Bits [31:0] of the source are copied to bits [31:0] of the 891 /// returned vector 892 /// 01: Bits [63:32] of the source are copied to bits [31:0] of the 893 /// returned vector 894 /// 10: Bits [95:64] of the source are copied to bits [31:0] of the 895 /// returned vector 896 /// 11: Bits [127:96] of the source are copied to bits [31:0] of the 897 /// returned vector 898 /// Bits [33:32]: 899 /// 00: Bits [31:0] of the source are copied to bits [63:32] of the 900 /// returned vector 901 /// 01: Bits [63:32] of the source are copied to bits [63:32] of the 902 /// returned vector 903 /// 10: Bits [95:64] of the source are copied to bits [63:32] of the 904 /// returned vector 905 /// 11: Bits [127:96] of the source are copied to bits [63:32] of the 906 /// returned vector 907 /// Bits [65:64]: 908 /// 00: Bits [31:0] of the source are copied to bits [95:64] of the 909 /// returned vector 910 /// 01: Bits [63:32] of the source are copied to bits [95:64] of the 911 /// returned vector 912 /// 10: Bits [95:64] of the source are copied to bits [95:64] of the 913 /// returned vector 914 /// 11: Bits [127:96] of the source are copied to bits [95:64] of the 915 /// returned vector 916 /// Bits [97:96]: 917 /// 00: Bits [31:0] of the source are copied to bits [127:96] of the 918 /// returned vector 919 /// 01: Bits [63:32] of 
the source are copied to bits [127:96] of the 920 /// returned vector 921 /// 10: Bits [95:64] of the source are copied to bits [127:96] of the 922 /// returned vector 923 /// 11: Bits [127:96] of the source are copied to bits [127:96] of the 924 /// returned vector 925 /// Bits [129:128]: 926 /// 00: Bits [159:128] of the source are copied to bits [159:128] of the 927 /// returned vector 928 /// 01: Bits [191:160] of the source are copied to bits [159:128] of the 929 /// returned vector 930 /// 10: Bits [223:192] of the source are copied to bits [159:128] of the 931 /// returned vector 932 /// 11: Bits [255:224] of the source are copied to bits [159:128] of the 933 /// returned vector 934 /// Bits [161:160]: 935 /// 00: Bits [159:128] of the source are copied to bits [191:160] of the 936 /// returned vector 937 /// 01: Bits [191:160] of the source are copied to bits [191:160] of the 938 /// returned vector 939 /// 10: Bits [223:192] of the source are copied to bits [191:160] of the 940 /// returned vector 941 /// 11: Bits [255:224] of the source are copied to bits [191:160] of the 942 /// returned vector 943 /// Bits [193:192]: 944 /// 00: Bits [159:128] of the source are copied to bits [223:192] of the 945 /// returned vector 946 /// 01: Bits [191:160] of the source are copied to bits [223:192] of the 947 /// returned vector 948 /// 10: Bits [223:192] of the source are copied to bits [223:192] of the 949 /// returned vector 950 /// 11: Bits [255:224] of the source are copied to bits [223:192] of the 951 /// returned vector 952 /// Bits [225:224]: 953 /// 00: Bits [159:128] of the source are copied to bits [255:224] of the 954 /// returned vector 955 /// 01: Bits [191:160] of the source are copied to bits [255:224] of the 956 /// returned vector 957 /// 10: Bits [223:192] of the source are copied to bits [255:224] of the 958 /// returned vector 959 /// 11: Bits [255:224] of the source are copied to bits [255:224] of the 960 /// returned vector 961 /// \returns A 
256-bit vector of [8 x float] containing the copied values. 962 static __inline __m256 __DEFAULT_FN_ATTRS 963 _mm256_permutevar_ps(__m256 __a, __m256i __c) 964 { 965 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c); 966 } 967 968 /// \brief Copies the values stored in a 128-bit vector of [2 x double] as 969 /// specified by the immediate integer operand. 970 /// 971 /// \headerfile <x86intrin.h> 972 /// 973 /// \code 974 /// __m128d _mm_permute_pd(__m128d A, const int C); 975 /// \endcode 976 /// 977 /// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction. 978 /// 979 /// \param A 980 /// A 128-bit vector of [2 x double]. 981 /// \param C 982 /// An immediate integer operand specifying how the values are to be copied. 983 /// Bit [0]: 984 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the 985 /// returned vector 986 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the 987 /// returned vector 988 /// Bit [1]: 989 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the 990 /// returned vector 991 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the 992 /// returned vector 993 /// \returns A 128-bit vector of [2 x double] containing the copied values. 994 #define _mm_permute_pd(A, C) __extension__ ({ \ 995 (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \ 996 (__v2df)_mm_setzero_pd(), \ 997 (C) & 0x1, ((C) & 0x2) >> 1); }) 998 999 /// \brief Copies the values stored in a 256-bit vector of [4 x double] as 1000 /// specified by the immediate integer operand. 1001 /// 1002 /// \headerfile <x86intrin.h> 1003 /// 1004 /// \code 1005 /// __m256d _mm256_permute_pd(__m256d A, const int C); 1006 /// \endcode 1007 /// 1008 /// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction. 1009 /// 1010 /// \param A 1011 /// A 256-bit vector of [4 x double]. 1012 /// \param C 1013 /// An immediate integer operand specifying how the values are to be copied. 
1014 /// Bit [0]: 1015 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the 1016 /// returned vector 1017 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the 1018 /// returned vector 1019 /// Bit [1]: 1020 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the 1021 /// returned vector 1022 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the 1023 /// returned vector 1024 /// Bit [2]: 1025 /// 0: Bits [191:128] of the source are copied to bits [191:128] of the 1026 /// returned vector 1027 /// 1: Bits [255:192] of the source are copied to bits [191:128] of the 1028 /// returned vector 1029 /// Bit [3]: 1030 /// 0: Bits [191:128] of the source are copied to bits [255:192] of the 1031 /// returned vector 1032 /// 1: Bits [255:192] of the source are copied to bits [255:192] of the 1033 /// returned vector 1034 /// \returns A 256-bit vector of [4 x double] containing the copied values. 1035 #define _mm256_permute_pd(A, C) __extension__ ({ \ 1036 (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \ 1037 (__v4df)_mm256_setzero_pd(), \ 1038 (C) & 0x1, ((C) & 0x2) >> 1, \ 1039 2 + (((C) & 0x4) >> 2), \ 1040 2 + (((C) & 0x8) >> 3)); }) 1041 1042 /// \brief Copies the values stored in a 128-bit vector of [4 x float] as 1043 /// specified by the immediate integer operand. 1044 /// 1045 /// \headerfile <x86intrin.h> 1046 /// 1047 /// \code 1048 /// __m128 _mm_permute_ps(__m128 A, const int C); 1049 /// \endcode 1050 /// 1051 /// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction. 1052 /// 1053 /// \param A 1054 /// A 128-bit vector of [4 x float]. 1055 /// \param C 1056 /// An immediate integer operand specifying how the values are to be copied. 
1057 /// Bits [1:0]: 1058 /// 00: Bits [31:0] of the source are copied to bits [31:0] of the 1059 /// returned vector 1060 /// 01: Bits [63:32] of the source are copied to bits [31:0] of the 1061 /// returned vector 1062 /// 10: Bits [95:64] of the source are copied to bits [31:0] of the 1063 /// returned vector 1064 /// 11: Bits [127:96] of the source are copied to bits [31:0] of the 1065 /// returned vector 1066 /// Bits [3:2]: 1067 /// 00: Bits [31:0] of the source are copied to bits [63:32] of the 1068 /// returned vector 1069 /// 01: Bits [63:32] of the source are copied to bits [63:32] of the 1070 /// returned vector 1071 /// 10: Bits [95:64] of the source are copied to bits [63:32] of the 1072 /// returned vector 1073 /// 11: Bits [127:96] of the source are copied to bits [63:32] of the 1074 /// returned vector 1075 /// Bits [5:4]: 1076 /// 00: Bits [31:0] of the source are copied to bits [95:64] of the 1077 /// returned vector 1078 /// 01: Bits [63:32] of the source are copied to bits [95:64] of the 1079 /// returned vector 1080 /// 10: Bits [95:64] of the source are copied to bits [95:64] of the 1081 /// returned vector 1082 /// 11: Bits [127:96] of the source are copied to bits [95:64] of the 1083 /// returned vector 1084 /// Bits [7:6]: 1085 /// 00: Bits [31:0] of the source are copied to bits [127:96] of the 1086 /// returned vector 1087 /// 01: Bits [63:32] of the source are copied to bits [127:96] of the 1088 /// returned vector 1089 /// 10: Bits [95:64] of the source are copied to bits [127:96] of the 1090 /// returned vector 1091 /// 11: Bits [127:96] of the source are copied to bits [127:96] of the 1092 /// returned vector 1093 /// \returns A 128-bit vector of [4 x float] containing the copied values. 
1094 #define _mm_permute_ps(A, C) __extension__ ({ \ 1095 (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \ 1096 (__v4sf)_mm_setzero_ps(), \ 1097 (C) & 0x3, ((C) & 0xc) >> 2, \ 1098 ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); }) 1099 1100 /// \brief Copies the values stored in a 256-bit vector of [8 x float] as 1101 /// specified by the immediate integer operand. 1102 /// 1103 /// \headerfile <x86intrin.h> 1104 /// 1105 /// \code 1106 /// __m256 _mm256_permute_ps(__m256 A, const int C); 1107 /// \endcode 1108 /// 1109 /// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction. 1110 /// 1111 /// \param A 1112 /// A 256-bit vector of [8 x float]. 1113 /// \param C 1114 /// An immediate integer operand specifying how the values are to be copied. 1115 /// Bits [1:0]: 1116 /// 00: Bits [31:0] of the source are copied to bits [31:0] of the 1117 /// returned vector 1118 /// 01: Bits [63:32] of the source are copied to bits [31:0] of the 1119 /// returned vector 1120 /// 10: Bits [95:64] of the source are copied to bits [31:0] of the 1121 /// returned vector 1122 /// 11: Bits [127:96] of the source are copied to bits [31:0] of the 1123 /// returned vector 1124 /// Bits [3:2]: 1125 /// 00: Bits [31:0] of the source are copied to bits [63:32] of the 1126 /// returned vector 1127 /// 01: Bits [63:32] of the source are copied to bits [63:32] of the 1128 /// returned vector 1129 /// 10: Bits [95:64] of the source are copied to bits [63:32] of the 1130 /// returned vector 1131 /// 11: Bits [127:96] of the source are copied to bits [63:32] of the 1132 /// returned vector 1133 /// Bits [5:4]: 1134 /// 00: Bits [31:0] of the source are copied to bits [95:64] of the 1135 /// returned vector 1136 /// 01: Bits [63:32] of the source are copied to bits [95:64] of the 1137 /// returned vector 1138 /// 10: Bits [95:64] of the source are copied to bits [95:64] of the 1139 /// returned vector 1140 /// 11: Bits [127:96] of the source are copied to bits [95:64] of the 1141 /// 
returned vector 1142 /// Bits [7:6]: 1143 /// 00: Bits [31:0] of the source are copied to bits [127:96] of the 1144 /// returned vector 1145 /// 01: Bits [63:32] of the source are copied to bits [127:96] of the 1146 /// returned vector 1147 /// 10: Bits [95:64] of the source are copied to bits [127:96] of the 1148 /// returned vector 1149 /// 11: Bits [127:96] of the source are copied to bits [127:96] of the 1150 /// returned vector 1151 /// Bits [1:0]: 1152 /// 00: Bits [159:128] of the source are copied to bits [159:128] of the 1153 /// returned vector 1154 /// 01: Bits [191:160] of the source are copied to bits [159:128] of the 1155 /// returned vector 1156 /// 10: Bits [223:192] of the source are copied to bits [159:128] of the 1157 /// returned vector 1158 /// 11: Bits [255:224] of the source are copied to bits [159:128] of the 1159 /// returned vector 1160 /// Bits [3:2]: 1161 /// 00: Bits [159:128] of the source are copied to bits [191:160] of the 1162 /// returned vector 1163 /// 01: Bits [191:160] of the source are copied to bits [191:160] of the 1164 /// returned vector 1165 /// 10: Bits [223:192] of the source are copied to bits [191:160] of the 1166 /// returned vector 1167 /// 11: Bits [255:224] of the source are copied to bits [191:160] of the 1168 /// returned vector 1169 /// Bits [5:4]: 1170 /// 00: Bits [159:128] of the source are copied to bits [223:192] of the 1171 /// returned vector 1172 /// 01: Bits [191:160] of the source are copied to bits [223:192] of the 1173 /// returned vector 1174 /// 10: Bits [223:192] of the source are copied to bits [223:192] of the 1175 /// returned vector 1176 /// 11: Bits [255:224] of the source are copied to bits [223:192] of the 1177 /// returned vector 1178 /// Bits [7:6]: 1179 /// 00: Bits [159:128] of the source are copied to bits [255:224] of the 1180 /// returned vector 1181 /// 01: Bits [191:160] of the source are copied to bits [255:224] of the 1182 /// returned vector 1183 /// 10: Bits [223:192] of the 
source are copied to bits [255:224] of the 1184 /// returned vector 1185 /// 11: Bits [255:224] of the source are copied to bits [255:224] of the 1186 /// returned vector 1187 /// \returns A 256-bit vector of [8 x float] containing the copied values. 1188 #define _mm256_permute_ps(A, C) __extension__ ({ \ 1189 (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \ 1190 (__v8sf)_mm256_setzero_ps(), \ 1191 (C) & 0x3, ((C) & 0xc) >> 2, \ 1192 ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \ 1193 4 + (((C) & 0x03) >> 0), \ 1194 4 + (((C) & 0x0c) >> 2), \ 1195 4 + (((C) & 0x30) >> 4), \ 1196 4 + (((C) & 0xc0) >> 6)); }) 1197 1198 #define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \ 1199 (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \ 1200 (__v4df)(__m256d)(V2), (M)); }) 1201 1202 #define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \ 1203 (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \ 1204 (__v8sf)(__m256)(V2), (M)); }) 1205 1206 #define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \ 1207 (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \ 1208 (__v8si)(__m256i)(V2), (M)); }) 1209 1210 /* Vector Blend */ 1211 #define _mm256_blend_pd(V1, V2, M) __extension__ ({ \ 1212 (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \ 1213 (__v4df)(__m256d)(V2), \ 1214 (((M) & 0x01) ? 4 : 0), \ 1215 (((M) & 0x02) ? 5 : 1), \ 1216 (((M) & 0x04) ? 6 : 2), \ 1217 (((M) & 0x08) ? 7 : 3)); }) 1218 1219 #define _mm256_blend_ps(V1, V2, M) __extension__ ({ \ 1220 (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \ 1221 (__v8sf)(__m256)(V2), \ 1222 (((M) & 0x01) ? 8 : 0), \ 1223 (((M) & 0x02) ? 9 : 1), \ 1224 (((M) & 0x04) ? 10 : 2), \ 1225 (((M) & 0x08) ? 11 : 3), \ 1226 (((M) & 0x10) ? 12 : 4), \ 1227 (((M) & 0x20) ? 13 : 5), \ 1228 (((M) & 0x40) ? 14 : 6), \ 1229 (((M) & 0x80) ? 
15 : 7)); }) 1230 1231 static __inline __m256d __DEFAULT_FN_ATTRS 1232 _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) 1233 { 1234 return (__m256d)__builtin_ia32_blendvpd256( 1235 (__v4df)__a, (__v4df)__b, (__v4df)__c); 1236 } 1237 1238 static __inline __m256 __DEFAULT_FN_ATTRS 1239 _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) 1240 { 1241 return (__m256)__builtin_ia32_blendvps256( 1242 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c); 1243 } 1244 1245 /* Vector Dot Product */ 1246 #define _mm256_dp_ps(V1, V2, M) __extension__ ({ \ 1247 (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \ 1248 (__v8sf)(__m256)(V2), (M)); }) 1249 1250 /* Vector shuffle */ 1251 #define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \ 1252 (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \ 1253 (__v8sf)(__m256)(b), \ 1254 (mask) & 0x3, \ 1255 ((mask) & 0xc) >> 2, \ 1256 (((mask) & 0x30) >> 4) + 8, \ 1257 (((mask) & 0xc0) >> 6) + 8, \ 1258 ((mask) & 0x3) + 4, \ 1259 (((mask) & 0xc) >> 2) + 4, \ 1260 (((mask) & 0x30) >> 4) + 12, \ 1261 (((mask) & 0xc0) >> 6) + 12); }) 1262 1263 #define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \ 1264 (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \ 1265 (__v4df)(__m256d)(b), \ 1266 (mask) & 0x1, \ 1267 (((mask) & 0x2) >> 1) + 4, \ 1268 (((mask) & 0x4) >> 2) + 2, \ 1269 (((mask) & 0x8) >> 3) + 6); }) 1270 1271 /* Compare */ 1272 #define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ 1273 #define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ 1274 #define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */ 1275 #define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */ 1276 #define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */ 1277 #define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ 1278 #define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ 1279 #define _CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */ 1280 #define _CMP_EQ_UQ 0x08 /* Equal (unordered, 
non-signaling) */ 1281 #define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */ 1282 #define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */ 1283 #define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */ 1284 #define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */ 1285 #define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */ 1286 #define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */ 1287 #define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */ 1288 #define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */ 1289 #define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */ 1290 #define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */ 1291 #define _CMP_UNORD_S 0x13 /* Unordered (signaling) */ 1292 #define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */ 1293 #define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */ 1294 #define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */ 1295 #define _CMP_ORD_S 0x17 /* Ordered (signaling) */ 1296 #define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */ 1297 #define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */ 1298 #define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */ 1299 #define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */ 1300 #define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */ 1301 #define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */ 1302 #define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */ 1303 #define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */ 1304 1305 #define _mm_cmp_pd(a, b, c) __extension__ ({ \ 1306 (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \ 1307 (__v2df)(__m128d)(b), (c)); }) 1308 1309 #define _mm_cmp_ps(a, b, c) __extension__ ({ \ 1310 (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \ 1311 (__v4sf)(__m128)(b), (c)); }) 1312 1313 #define _mm256_cmp_pd(a, b, 
c) __extension__ ({ \ 1314 (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \ 1315 (__v4df)(__m256d)(b), (c)); }) 1316 1317 #define _mm256_cmp_ps(a, b, c) __extension__ ({ \ 1318 (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \ 1319 (__v8sf)(__m256)(b), (c)); }) 1320 1321 #define _mm_cmp_sd(a, b, c) __extension__ ({ \ 1322 (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \ 1323 (__v2df)(__m128d)(b), (c)); }) 1324 1325 #define _mm_cmp_ss(a, b, c) __extension__ ({ \ 1326 (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \ 1327 (__v4sf)(__m128)(b), (c)); }) 1328 1329 static __inline int __DEFAULT_FN_ATTRS 1330 _mm256_extract_epi32(__m256i __a, const int __imm) 1331 { 1332 __v8si __b = (__v8si)__a; 1333 return __b[__imm & 7]; 1334 } 1335 1336 static __inline int __DEFAULT_FN_ATTRS 1337 _mm256_extract_epi16(__m256i __a, const int __imm) 1338 { 1339 __v16hi __b = (__v16hi)__a; 1340 return __b[__imm & 15]; 1341 } 1342 1343 static __inline int __DEFAULT_FN_ATTRS 1344 _mm256_extract_epi8(__m256i __a, const int __imm) 1345 { 1346 __v32qi __b = (__v32qi)__a; 1347 return __b[__imm & 31]; 1348 } 1349 1350 #ifdef __x86_64__ 1351 static __inline long long __DEFAULT_FN_ATTRS 1352 _mm256_extract_epi64(__m256i __a, const int __imm) 1353 { 1354 __v4di __b = (__v4di)__a; 1355 return __b[__imm & 3]; 1356 } 1357 #endif 1358 1359 static __inline __m256i __DEFAULT_FN_ATTRS 1360 _mm256_insert_epi32(__m256i __a, int __b, int const __imm) 1361 { 1362 __v8si __c = (__v8si)__a; 1363 __c[__imm & 7] = __b; 1364 return (__m256i)__c; 1365 } 1366 1367 static __inline __m256i __DEFAULT_FN_ATTRS 1368 _mm256_insert_epi16(__m256i __a, int __b, int const __imm) 1369 { 1370 __v16hi __c = (__v16hi)__a; 1371 __c[__imm & 15] = __b; 1372 return (__m256i)__c; 1373 } 1374 1375 static __inline __m256i __DEFAULT_FN_ATTRS 1376 _mm256_insert_epi8(__m256i __a, int __b, int const __imm) 1377 { 1378 __v32qi __c = (__v32qi)__a; 1379 __c[__imm & 31] = __b; 1380 return (__m256i)__c; 1381 } 1382 1383 
#ifdef __x86_64__ 1384 static __inline __m256i __DEFAULT_FN_ATTRS 1385 _mm256_insert_epi64(__m256i __a, long long __b, int const __imm) 1386 { 1387 __v4di __c = (__v4di)__a; 1388 __c[__imm & 3] = __b; 1389 return (__m256i)__c; 1390 } 1391 #endif 1392 1393 /* Conversion */ 1394 static __inline __m256d __DEFAULT_FN_ATTRS 1395 _mm256_cvtepi32_pd(__m128i __a) 1396 { 1397 return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) __a); 1398 } 1399 1400 static __inline __m256 __DEFAULT_FN_ATTRS 1401 _mm256_cvtepi32_ps(__m256i __a) 1402 { 1403 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a); 1404 } 1405 1406 static __inline __m128 __DEFAULT_FN_ATTRS 1407 _mm256_cvtpd_ps(__m256d __a) 1408 { 1409 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a); 1410 } 1411 1412 static __inline __m256i __DEFAULT_FN_ATTRS 1413 _mm256_cvtps_epi32(__m256 __a) 1414 { 1415 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a); 1416 } 1417 1418 static __inline __m256d __DEFAULT_FN_ATTRS 1419 _mm256_cvtps_pd(__m128 __a) 1420 { 1421 return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) __a); 1422 } 1423 1424 static __inline __m128i __DEFAULT_FN_ATTRS 1425 _mm256_cvttpd_epi32(__m256d __a) 1426 { 1427 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a); 1428 } 1429 1430 static __inline __m128i __DEFAULT_FN_ATTRS 1431 _mm256_cvtpd_epi32(__m256d __a) 1432 { 1433 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a); 1434 } 1435 1436 static __inline __m256i __DEFAULT_FN_ATTRS 1437 _mm256_cvttps_epi32(__m256 __a) 1438 { 1439 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a); 1440 } 1441 1442 /* Vector replicate */ 1443 static __inline __m256 __DEFAULT_FN_ATTRS 1444 _mm256_movehdup_ps(__m256 __a) 1445 { 1446 return __builtin_shufflevector(__a, __a, 1, 1, 3, 3, 5, 5, 7, 7); 1447 } 1448 1449 static __inline __m256 __DEFAULT_FN_ATTRS 1450 _mm256_moveldup_ps(__m256 __a) 1451 { 1452 return __builtin_shufflevector(__a, __a, 0, 0, 2, 2, 4, 4, 6, 6); 1453 } 1454 1455 static __inline 
__m256d __DEFAULT_FN_ATTRS 1456 _mm256_movedup_pd(__m256d __a) 1457 { 1458 return __builtin_shufflevector(__a, __a, 0, 0, 2, 2); 1459 } 1460 1461 /* Unpack and Interleave */ 1462 static __inline __m256d __DEFAULT_FN_ATTRS 1463 _mm256_unpackhi_pd(__m256d __a, __m256d __b) 1464 { 1465 return __builtin_shufflevector(__a, __b, 1, 5, 1+2, 5+2); 1466 } 1467 1468 static __inline __m256d __DEFAULT_FN_ATTRS 1469 _mm256_unpacklo_pd(__m256d __a, __m256d __b) 1470 { 1471 return __builtin_shufflevector(__a, __b, 0, 4, 0+2, 4+2); 1472 } 1473 1474 static __inline __m256 __DEFAULT_FN_ATTRS 1475 _mm256_unpackhi_ps(__m256 __a, __m256 __b) 1476 { 1477 return __builtin_shufflevector(__a, __b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); 1478 } 1479 1480 static __inline __m256 __DEFAULT_FN_ATTRS 1481 _mm256_unpacklo_ps(__m256 __a, __m256 __b) 1482 { 1483 return __builtin_shufflevector(__a, __b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); 1484 } 1485 1486 /* Bit Test */ 1487 static __inline int __DEFAULT_FN_ATTRS 1488 _mm_testz_pd(__m128d __a, __m128d __b) 1489 { 1490 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b); 1491 } 1492 1493 static __inline int __DEFAULT_FN_ATTRS 1494 _mm_testc_pd(__m128d __a, __m128d __b) 1495 { 1496 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b); 1497 } 1498 1499 static __inline int __DEFAULT_FN_ATTRS 1500 _mm_testnzc_pd(__m128d __a, __m128d __b) 1501 { 1502 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b); 1503 } 1504 1505 static __inline int __DEFAULT_FN_ATTRS 1506 _mm_testz_ps(__m128 __a, __m128 __b) 1507 { 1508 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b); 1509 } 1510 1511 static __inline int __DEFAULT_FN_ATTRS 1512 _mm_testc_ps(__m128 __a, __m128 __b) 1513 { 1514 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b); 1515 } 1516 1517 static __inline int __DEFAULT_FN_ATTRS 1518 _mm_testnzc_ps(__m128 __a, __m128 __b) 1519 { 1520 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b); 1521 } 1522 1523 static __inline int 
__DEFAULT_FN_ATTRS 1524 _mm256_testz_pd(__m256d __a, __m256d __b) 1525 { 1526 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b); 1527 } 1528 1529 static __inline int __DEFAULT_FN_ATTRS 1530 _mm256_testc_pd(__m256d __a, __m256d __b) 1531 { 1532 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b); 1533 } 1534 1535 static __inline int __DEFAULT_FN_ATTRS 1536 _mm256_testnzc_pd(__m256d __a, __m256d __b) 1537 { 1538 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b); 1539 } 1540 1541 static __inline int __DEFAULT_FN_ATTRS 1542 _mm256_testz_ps(__m256 __a, __m256 __b) 1543 { 1544 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b); 1545 } 1546 1547 static __inline int __DEFAULT_FN_ATTRS 1548 _mm256_testc_ps(__m256 __a, __m256 __b) 1549 { 1550 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b); 1551 } 1552 1553 static __inline int __DEFAULT_FN_ATTRS 1554 _mm256_testnzc_ps(__m256 __a, __m256 __b) 1555 { 1556 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b); 1557 } 1558 1559 static __inline int __DEFAULT_FN_ATTRS 1560 _mm256_testz_si256(__m256i __a, __m256i __b) 1561 { 1562 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b); 1563 } 1564 1565 static __inline int __DEFAULT_FN_ATTRS 1566 _mm256_testc_si256(__m256i __a, __m256i __b) 1567 { 1568 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b); 1569 } 1570 1571 static __inline int __DEFAULT_FN_ATTRS 1572 _mm256_testnzc_si256(__m256i __a, __m256i __b) 1573 { 1574 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b); 1575 } 1576 1577 /* Vector extract sign mask */ 1578 static __inline int __DEFAULT_FN_ATTRS 1579 _mm256_movemask_pd(__m256d __a) 1580 { 1581 return __builtin_ia32_movmskpd256((__v4df)__a); 1582 } 1583 1584 static __inline int __DEFAULT_FN_ATTRS 1585 _mm256_movemask_ps(__m256 __a) 1586 { 1587 return __builtin_ia32_movmskps256((__v8sf)__a); 1588 } 1589 1590 /* Vector __zero */ 1591 static __inline void __DEFAULT_FN_ATTRS 1592 
_mm256_zeroall(void) 1593 { 1594 __builtin_ia32_vzeroall(); 1595 } 1596 1597 static __inline void __DEFAULT_FN_ATTRS 1598 _mm256_zeroupper(void) 1599 { 1600 __builtin_ia32_vzeroupper(); 1601 } 1602 1603 /* Vector load with broadcast */ 1604 static __inline __m128 __DEFAULT_FN_ATTRS 1605 _mm_broadcast_ss(float const *__a) 1606 { 1607 float __f = *__a; 1608 return (__m128)(__v4sf){ __f, __f, __f, __f }; 1609 } 1610 1611 static __inline __m256d __DEFAULT_FN_ATTRS 1612 _mm256_broadcast_sd(double const *__a) 1613 { 1614 double __d = *__a; 1615 return (__m256d)(__v4df){ __d, __d, __d, __d }; 1616 } 1617 1618 static __inline __m256 __DEFAULT_FN_ATTRS 1619 _mm256_broadcast_ss(float const *__a) 1620 { 1621 float __f = *__a; 1622 return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f }; 1623 } 1624 1625 static __inline __m256d __DEFAULT_FN_ATTRS 1626 _mm256_broadcast_pd(__m128d const *__a) 1627 { 1628 return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__a); 1629 } 1630 1631 static __inline __m256 __DEFAULT_FN_ATTRS 1632 _mm256_broadcast_ps(__m128 const *__a) 1633 { 1634 return (__m256)__builtin_ia32_vbroadcastf128_ps256(__a); 1635 } 1636 1637 /* SIMD load ops */ 1638 static __inline __m256d __DEFAULT_FN_ATTRS 1639 _mm256_load_pd(double const *__p) 1640 { 1641 return *(__m256d *)__p; 1642 } 1643 1644 static __inline __m256 __DEFAULT_FN_ATTRS 1645 _mm256_load_ps(float const *__p) 1646 { 1647 return *(__m256 *)__p; 1648 } 1649 1650 static __inline __m256d __DEFAULT_FN_ATTRS 1651 _mm256_loadu_pd(double const *__p) 1652 { 1653 struct __loadu_pd { 1654 __m256d __v; 1655 } __attribute__((__packed__, __may_alias__)); 1656 return ((struct __loadu_pd*)__p)->__v; 1657 } 1658 1659 static __inline __m256 __DEFAULT_FN_ATTRS 1660 _mm256_loadu_ps(float const *__p) 1661 { 1662 struct __loadu_ps { 1663 __m256 __v; 1664 } __attribute__((__packed__, __may_alias__)); 1665 return ((struct __loadu_ps*)__p)->__v; 1666 } 1667 1668 static __inline __m256i __DEFAULT_FN_ATTRS 1669 
_mm256_load_si256(__m256i const *__p) 1670 { 1671 return *__p; 1672 } 1673 1674 static __inline __m256i __DEFAULT_FN_ATTRS 1675 _mm256_loadu_si256(__m256i const *__p) 1676 { 1677 struct __loadu_si256 { 1678 __m256i __v; 1679 } __attribute__((__packed__, __may_alias__)); 1680 return ((struct __loadu_si256*)__p)->__v; 1681 } 1682 1683 static __inline __m256i __DEFAULT_FN_ATTRS 1684 _mm256_lddqu_si256(__m256i const *__p) 1685 { 1686 return (__m256i)__builtin_ia32_lddqu256((char const *)__p); 1687 } 1688 1689 /* SIMD store ops */ 1690 static __inline void __DEFAULT_FN_ATTRS 1691 _mm256_store_pd(double *__p, __m256d __a) 1692 { 1693 *(__m256d *)__p = __a; 1694 } 1695 1696 static __inline void __DEFAULT_FN_ATTRS 1697 _mm256_store_ps(float *__p, __m256 __a) 1698 { 1699 *(__m256 *)__p = __a; 1700 } 1701 1702 static __inline void __DEFAULT_FN_ATTRS 1703 _mm256_storeu_pd(double *__p, __m256d __a) 1704 { 1705 __builtin_ia32_storeupd256(__p, (__v4df)__a); 1706 } 1707 1708 static __inline void __DEFAULT_FN_ATTRS 1709 _mm256_storeu_ps(float *__p, __m256 __a) 1710 { 1711 __builtin_ia32_storeups256(__p, (__v8sf)__a); 1712 } 1713 1714 static __inline void __DEFAULT_FN_ATTRS 1715 _mm256_store_si256(__m256i *__p, __m256i __a) 1716 { 1717 *__p = __a; 1718 } 1719 1720 static __inline void __DEFAULT_FN_ATTRS 1721 _mm256_storeu_si256(__m256i *__p, __m256i __a) 1722 { 1723 __builtin_ia32_storedqu256((char *)__p, (__v32qi)__a); 1724 } 1725 1726 /* Conditional load ops */ 1727 static __inline __m128d __DEFAULT_FN_ATTRS 1728 _mm_maskload_pd(double const *__p, __m128i __m) 1729 { 1730 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m); 1731 } 1732 1733 static __inline __m256d __DEFAULT_FN_ATTRS 1734 _mm256_maskload_pd(double const *__p, __m256i __m) 1735 { 1736 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p, 1737 (__v4di)__m); 1738 } 1739 1740 static __inline __m128 __DEFAULT_FN_ATTRS 1741 _mm_maskload_ps(float const *__p, __m128i __m) 1742 { 1743 
return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m); 1744 } 1745 1746 static __inline __m256 __DEFAULT_FN_ATTRS 1747 _mm256_maskload_ps(float const *__p, __m256i __m) 1748 { 1749 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m); 1750 } 1751 1752 /* Conditional store ops */ 1753 static __inline void __DEFAULT_FN_ATTRS 1754 _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a) 1755 { 1756 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a); 1757 } 1758 1759 static __inline void __DEFAULT_FN_ATTRS 1760 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a) 1761 { 1762 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a); 1763 } 1764 1765 static __inline void __DEFAULT_FN_ATTRS 1766 _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a) 1767 { 1768 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a); 1769 } 1770 1771 static __inline void __DEFAULT_FN_ATTRS 1772 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) 1773 { 1774 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a); 1775 } 1776 1777 /* Cacheability support ops */ 1778 static __inline void __DEFAULT_FN_ATTRS 1779 _mm256_stream_si256(__m256i *__a, __m256i __b) 1780 { 1781 __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b); 1782 } 1783 1784 static __inline void __DEFAULT_FN_ATTRS 1785 _mm256_stream_pd(double *__a, __m256d __b) 1786 { 1787 __builtin_ia32_movntpd256(__a, (__v4df)__b); 1788 } 1789 1790 static __inline void __DEFAULT_FN_ATTRS 1791 _mm256_stream_ps(float *__p, __m256 __a) 1792 { 1793 __builtin_ia32_movntps256(__p, (__v8sf)__a); 1794 } 1795 1796 /* Create vectors */ 1797 static __inline__ __m256d __DEFAULT_FN_ATTRS 1798 _mm256_undefined_pd() 1799 { 1800 return (__m256d)__builtin_ia32_undef256(); 1801 } 1802 1803 static __inline__ __m256 __DEFAULT_FN_ATTRS 1804 _mm256_undefined_ps() 1805 { 1806 return (__m256)__builtin_ia32_undef256(); 1807 } 1808 1809 static __inline__ 
__m256i __DEFAULT_FN_ATTRS 1810 _mm256_undefined_si256() 1811 { 1812 return (__m256i)__builtin_ia32_undef256(); 1813 } 1814 1815 static __inline __m256d __DEFAULT_FN_ATTRS 1816 _mm256_set_pd(double __a, double __b, double __c, double __d) 1817 { 1818 return (__m256d){ __d, __c, __b, __a }; 1819 } 1820 1821 static __inline __m256 __DEFAULT_FN_ATTRS 1822 _mm256_set_ps(float __a, float __b, float __c, float __d, 1823 float __e, float __f, float __g, float __h) 1824 { 1825 return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a }; 1826 } 1827 1828 static __inline __m256i __DEFAULT_FN_ATTRS 1829 _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, 1830 int __i4, int __i5, int __i6, int __i7) 1831 { 1832 return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 }; 1833 } 1834 1835 static __inline __m256i __DEFAULT_FN_ATTRS 1836 _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, 1837 short __w11, short __w10, short __w09, short __w08, 1838 short __w07, short __w06, short __w05, short __w04, 1839 short __w03, short __w02, short __w01, short __w00) 1840 { 1841 return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06, 1842 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 }; 1843 } 1844 1845 static __inline __m256i __DEFAULT_FN_ATTRS 1846 _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, 1847 char __b27, char __b26, char __b25, char __b24, 1848 char __b23, char __b22, char __b21, char __b20, 1849 char __b19, char __b18, char __b17, char __b16, 1850 char __b15, char __b14, char __b13, char __b12, 1851 char __b11, char __b10, char __b09, char __b08, 1852 char __b07, char __b06, char __b05, char __b04, 1853 char __b03, char __b02, char __b01, char __b00) 1854 { 1855 return (__m256i)(__v32qi){ 1856 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07, 1857 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15, 1858 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23, 1859 __b24, __b25, __b26, 
__b27, __b28, __b29, __b30, __b31 1860 }; 1861 } 1862 1863 static __inline __m256i __DEFAULT_FN_ATTRS 1864 _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) 1865 { 1866 return (__m256i)(__v4di){ __d, __c, __b, __a }; 1867 } 1868 1869 /* Create vectors with elements in reverse order */ 1870 static __inline __m256d __DEFAULT_FN_ATTRS 1871 _mm256_setr_pd(double __a, double __b, double __c, double __d) 1872 { 1873 return (__m256d){ __a, __b, __c, __d }; 1874 } 1875 1876 static __inline __m256 __DEFAULT_FN_ATTRS 1877 _mm256_setr_ps(float __a, float __b, float __c, float __d, 1878 float __e, float __f, float __g, float __h) 1879 { 1880 return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h }; 1881 } 1882 1883 static __inline __m256i __DEFAULT_FN_ATTRS 1884 _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, 1885 int __i4, int __i5, int __i6, int __i7) 1886 { 1887 return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 }; 1888 } 1889 1890 static __inline __m256i __DEFAULT_FN_ATTRS 1891 _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, 1892 short __w11, short __w10, short __w09, short __w08, 1893 short __w07, short __w06, short __w05, short __w04, 1894 short __w03, short __w02, short __w01, short __w00) 1895 { 1896 return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09, 1897 __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 }; 1898 } 1899 1900 static __inline __m256i __DEFAULT_FN_ATTRS 1901 _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, 1902 char __b27, char __b26, char __b25, char __b24, 1903 char __b23, char __b22, char __b21, char __b20, 1904 char __b19, char __b18, char __b17, char __b16, 1905 char __b15, char __b14, char __b13, char __b12, 1906 char __b11, char __b10, char __b09, char __b08, 1907 char __b07, char __b06, char __b05, char __b04, 1908 char __b03, char __b02, char __b01, char __b00) 1909 { 1910 return (__m256i)(__v32qi){ 1911 __b31, 
__b30, __b29, __b28, __b27, __b26, __b25, __b24, 1912 __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16, 1913 __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08, 1914 __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 }; 1915 } 1916 1917 static __inline __m256i __DEFAULT_FN_ATTRS 1918 _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) 1919 { 1920 return (__m256i)(__v4di){ __a, __b, __c, __d }; 1921 } 1922 1923 /* Create vectors with repeated elements */ 1924 static __inline __m256d __DEFAULT_FN_ATTRS 1925 _mm256_set1_pd(double __w) 1926 { 1927 return (__m256d){ __w, __w, __w, __w }; 1928 } 1929 1930 static __inline __m256 __DEFAULT_FN_ATTRS 1931 _mm256_set1_ps(float __w) 1932 { 1933 return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w }; 1934 } 1935 1936 static __inline __m256i __DEFAULT_FN_ATTRS 1937 _mm256_set1_epi32(int __i) 1938 { 1939 return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i }; 1940 } 1941 1942 static __inline __m256i __DEFAULT_FN_ATTRS 1943 _mm256_set1_epi16(short __w) 1944 { 1945 return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w, 1946 __w, __w, __w, __w, __w, __w }; 1947 } 1948 1949 static __inline __m256i __DEFAULT_FN_ATTRS 1950 _mm256_set1_epi8(char __b) 1951 { 1952 return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, 1953 __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, 1954 __b, __b, __b, __b, __b, __b, __b }; 1955 } 1956 1957 static __inline __m256i __DEFAULT_FN_ATTRS 1958 _mm256_set1_epi64x(long long __q) 1959 { 1960 return (__m256i)(__v4di){ __q, __q, __q, __q }; 1961 } 1962 1963 /* Create __zeroed vectors */ 1964 static __inline __m256d __DEFAULT_FN_ATTRS 1965 _mm256_setzero_pd(void) 1966 { 1967 return (__m256d){ 0, 0, 0, 0 }; 1968 } 1969 1970 static __inline __m256 __DEFAULT_FN_ATTRS 1971 _mm256_setzero_ps(void) 1972 { 1973 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 }; 1974 } 1975 1976 static __inline __m256i 
__DEFAULT_FN_ATTRS 1977 _mm256_setzero_si256(void) 1978 { 1979 return (__m256i){ 0LL, 0LL, 0LL, 0LL }; 1980 } 1981 1982 /* Cast between vector types */ 1983 static __inline __m256 __DEFAULT_FN_ATTRS 1984 _mm256_castpd_ps(__m256d __a) 1985 { 1986 return (__m256)__a; 1987 } 1988 1989 static __inline __m256i __DEFAULT_FN_ATTRS 1990 _mm256_castpd_si256(__m256d __a) 1991 { 1992 return (__m256i)__a; 1993 } 1994 1995 static __inline __m256d __DEFAULT_FN_ATTRS 1996 _mm256_castps_pd(__m256 __a) 1997 { 1998 return (__m256d)__a; 1999 } 2000 2001 static __inline __m256i __DEFAULT_FN_ATTRS 2002 _mm256_castps_si256(__m256 __a) 2003 { 2004 return (__m256i)__a; 2005 } 2006 2007 static __inline __m256 __DEFAULT_FN_ATTRS 2008 _mm256_castsi256_ps(__m256i __a) 2009 { 2010 return (__m256)__a; 2011 } 2012 2013 static __inline __m256d __DEFAULT_FN_ATTRS 2014 _mm256_castsi256_pd(__m256i __a) 2015 { 2016 return (__m256d)__a; 2017 } 2018 2019 static __inline __m128d __DEFAULT_FN_ATTRS 2020 _mm256_castpd256_pd128(__m256d __a) 2021 { 2022 return __builtin_shufflevector(__a, __a, 0, 1); 2023 } 2024 2025 static __inline __m128 __DEFAULT_FN_ATTRS 2026 _mm256_castps256_ps128(__m256 __a) 2027 { 2028 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3); 2029 } 2030 2031 static __inline __m128i __DEFAULT_FN_ATTRS 2032 _mm256_castsi256_si128(__m256i __a) 2033 { 2034 return __builtin_shufflevector(__a, __a, 0, 1); 2035 } 2036 2037 static __inline __m256d __DEFAULT_FN_ATTRS 2038 _mm256_castpd128_pd256(__m128d __a) 2039 { 2040 return __builtin_shufflevector(__a, __a, 0, 1, -1, -1); 2041 } 2042 2043 static __inline __m256 __DEFAULT_FN_ATTRS 2044 _mm256_castps128_ps256(__m128 __a) 2045 { 2046 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1); 2047 } 2048 2049 static __inline __m256i __DEFAULT_FN_ATTRS 2050 _mm256_castsi128_si256(__m128i __a) 2051 { 2052 return __builtin_shufflevector(__a, __a, 0, 1, -1, -1); 2053 } 2054 2055 /* 2056 Vector insert. 
2057 We use macros rather than inlines because we only want to accept 2058 invocations where the immediate M is a constant expression. 2059 */ 2060 #define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \ 2061 (__m256)__builtin_shufflevector( \ 2062 (__v8sf)(__m256)(V1), \ 2063 (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \ 2064 (((M) & 1) ? 0 : 8), \ 2065 (((M) & 1) ? 1 : 9), \ 2066 (((M) & 1) ? 2 : 10), \ 2067 (((M) & 1) ? 3 : 11), \ 2068 (((M) & 1) ? 8 : 4), \ 2069 (((M) & 1) ? 9 : 5), \ 2070 (((M) & 1) ? 10 : 6), \ 2071 (((M) & 1) ? 11 : 7) );}) 2072 2073 #define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \ 2074 (__m256d)__builtin_shufflevector( \ 2075 (__v4df)(__m256d)(V1), \ 2076 (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \ 2077 (((M) & 1) ? 0 : 4), \ 2078 (((M) & 1) ? 1 : 5), \ 2079 (((M) & 1) ? 4 : 2), \ 2080 (((M) & 1) ? 5 : 3) );}) 2081 2082 #define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \ 2083 (__m256i)__builtin_shufflevector( \ 2084 (__v4di)(__m256i)(V1), \ 2085 (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \ 2086 (((M) & 1) ? 0 : 4), \ 2087 (((M) & 1) ? 1 : 5), \ 2088 (((M) & 1) ? 4 : 2), \ 2089 (((M) & 1) ? 5 : 3) );}) 2090 2091 /* 2092 Vector extract. 2093 We use macros rather than inlines because we only want to accept 2094 invocations where the immediate M is a constant expression. 2095 */ 2096 #define _mm256_extractf128_ps(V, M) __extension__ ({ \ 2097 (__m128)__builtin_shufflevector( \ 2098 (__v8sf)(__m256)(V), \ 2099 (__v8sf)(_mm256_setzero_ps()), \ 2100 (((M) & 1) ? 4 : 0), \ 2101 (((M) & 1) ? 5 : 1), \ 2102 (((M) & 1) ? 6 : 2), \ 2103 (((M) & 1) ? 7 : 3) );}) 2104 2105 #define _mm256_extractf128_pd(V, M) __extension__ ({ \ 2106 (__m128d)__builtin_shufflevector( \ 2107 (__v4df)(__m256d)(V), \ 2108 (__v4df)(_mm256_setzero_pd()), \ 2109 (((M) & 1) ? 2 : 0), \ 2110 (((M) & 1) ? 
3 : 1) );}) 2111 2112 #define _mm256_extractf128_si256(V, M) __extension__ ({ \ 2113 (__m128i)__builtin_shufflevector( \ 2114 (__v4di)(__m256i)(V), \ 2115 (__v4di)(_mm256_setzero_si256()), \ 2116 (((M) & 1) ? 2 : 0), \ 2117 (((M) & 1) ? 3 : 1) );}) 2118 2119 /* SIMD load ops (unaligned) */ 2120 static __inline __m256 __DEFAULT_FN_ATTRS 2121 _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo) 2122 { 2123 struct __loadu_ps { 2124 __m128 __v; 2125 } __attribute__((__packed__, __may_alias__)); 2126 2127 __m256 __v256 = _mm256_castps128_ps256(((struct __loadu_ps*)__addr_lo)->__v); 2128 return _mm256_insertf128_ps(__v256, ((struct __loadu_ps*)__addr_hi)->__v, 1); 2129 } 2130 2131 static __inline __m256d __DEFAULT_FN_ATTRS 2132 _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo) 2133 { 2134 struct __loadu_pd { 2135 __m128d __v; 2136 } __attribute__((__packed__, __may_alias__)); 2137 2138 __m256d __v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)__addr_lo)->__v); 2139 return _mm256_insertf128_pd(__v256, ((struct __loadu_pd*)__addr_hi)->__v, 1); 2140 } 2141 2142 static __inline __m256i __DEFAULT_FN_ATTRS 2143 _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo) 2144 { 2145 struct __loadu_si128 { 2146 __m128i __v; 2147 } __attribute__((__packed__, __may_alias__)); 2148 __m256i __v256 = _mm256_castsi128_si256( 2149 ((struct __loadu_si128*)__addr_lo)->__v); 2150 return _mm256_insertf128_si256(__v256, 2151 ((struct __loadu_si128*)__addr_hi)->__v, 1); 2152 } 2153 2154 /* SIMD store ops (unaligned) */ 2155 static __inline void __DEFAULT_FN_ATTRS 2156 _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a) 2157 { 2158 __m128 __v128; 2159 2160 __v128 = _mm256_castps256_ps128(__a); 2161 __builtin_ia32_storeups(__addr_lo, __v128); 2162 __v128 = _mm256_extractf128_ps(__a, 1); 2163 __builtin_ia32_storeups(__addr_hi, __v128); 2164 } 2165 2166 static __inline void __DEFAULT_FN_ATTRS 2167 _mm256_storeu2_m128d(double 
*__addr_hi, double *__addr_lo, __m256d __a) 2168 { 2169 __m128d __v128; 2170 2171 __v128 = _mm256_castpd256_pd128(__a); 2172 __builtin_ia32_storeupd(__addr_lo, __v128); 2173 __v128 = _mm256_extractf128_pd(__a, 1); 2174 __builtin_ia32_storeupd(__addr_hi, __v128); 2175 } 2176 2177 static __inline void __DEFAULT_FN_ATTRS 2178 _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a) 2179 { 2180 __m128i __v128; 2181 2182 __v128 = _mm256_castsi256_si128(__a); 2183 __builtin_ia32_storedqu((char *)__addr_lo, (__v16qi)__v128); 2184 __v128 = _mm256_extractf128_si256(__a, 1); 2185 __builtin_ia32_storedqu((char *)__addr_hi, (__v16qi)__v128); 2186 } 2187 2188 static __inline __m256 __DEFAULT_FN_ATTRS 2189 _mm256_set_m128 (__m128 __hi, __m128 __lo) { 2190 return (__m256) __builtin_shufflevector(__lo, __hi, 0, 1, 2, 3, 4, 5, 6, 7); 2191 } 2192 2193 static __inline __m256d __DEFAULT_FN_ATTRS 2194 _mm256_set_m128d (__m128d __hi, __m128d __lo) { 2195 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); 2196 } 2197 2198 static __inline __m256i __DEFAULT_FN_ATTRS 2199 _mm256_set_m128i (__m128i __hi, __m128i __lo) { 2200 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); 2201 } 2202 2203 static __inline __m256 __DEFAULT_FN_ATTRS 2204 _mm256_setr_m128 (__m128 __lo, __m128 __hi) { 2205 return _mm256_set_m128(__hi, __lo); 2206 } 2207 2208 static __inline __m256d __DEFAULT_FN_ATTRS 2209 _mm256_setr_m128d (__m128d __lo, __m128d __hi) { 2210 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); 2211 } 2212 2213 static __inline __m256i __DEFAULT_FN_ATTRS 2214 _mm256_setr_m128i (__m128i __lo, __m128i __hi) { 2215 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); 2216 } 2217 2218 #undef __DEFAULT_FN_ATTRS 2219 2220 #endif /* __AVXINTRIN_H */ 2221