Home | History | Annotate | Download | only in include
      1 /*===---- avxintrin.h - AVX intrinsics -------------------------------------===
      2  *
      3  * Permission is hereby granted, free of charge, to any person obtaining a copy
      4  * of this software and associated documentation files (the "Software"), to deal
      5  * in the Software without restriction, including without limitation the rights
      6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      7  * copies of the Software, and to permit persons to whom the Software is
      8  * furnished to do so, subject to the following conditions:
      9  *
     10  * The above copyright notice and this permission notice shall be included in
     11  * all copies or substantial portions of the Software.
     12  *
     13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     19  * THE SOFTWARE.
     20  *
     21  *===-----------------------------------------------------------------------===
     22  */
     23 
     24 #ifndef __IMMINTRIN_H
     25 #error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
     26 #endif
     27 
     28 #ifndef __AVXINTRIN_H
     29 #define __AVXINTRIN_H
     30 
     31 typedef double __v4df __attribute__ ((__vector_size__ (32)));
     32 typedef float __v8sf __attribute__ ((__vector_size__ (32)));
     33 typedef long long __v4di __attribute__ ((__vector_size__ (32)));
     34 typedef int __v8si __attribute__ ((__vector_size__ (32)));
     35 typedef short __v16hi __attribute__ ((__vector_size__ (32)));
     36 typedef char __v32qi __attribute__ ((__vector_size__ (32)));
     37 
     38 /* Unsigned types */
     39 typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
     40 typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
     41 typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
     42 typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
     43 
     44 /* We need an explicitly signed variant for char. Note that this shouldn't
     45  * appear in the interface though. */
     46 typedef signed char __v32qs __attribute__((__vector_size__(32)));
     47 
     48 typedef float __m256 __attribute__ ((__vector_size__ (32)));
     49 typedef double __m256d __attribute__((__vector_size__(32)));
     50 typedef long long __m256i __attribute__((__vector_size__(32)));
     51 
     52 /* Define the default attributes for the functions in this file. */
     53 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))
     54 
     55 /* Arithmetic */
     56 /// \brief Adds two 256-bit vectors of [4 x double].
     57 ///
     58 /// \headerfile <x86intrin.h>
     59 ///
     60 /// This intrinsic corresponds to the <c> VADDPD </c> instruction.
     61 ///
     62 /// \param __a
     63 ///    A 256-bit vector of [4 x double] containing one of the source operands.
     64 /// \param __b
     65 ///    A 256-bit vector of [4 x double] containing one of the source operands.
     66 /// \returns A 256-bit vector of [4 x double] containing the sums of both
     67 ///    operands.
     68 static __inline __m256d __DEFAULT_FN_ATTRS
     69 _mm256_add_pd(__m256d __a, __m256d __b)
     70 {
     71   return (__m256d)((__v4df)__a+(__v4df)__b);
     72 }
     73 
     74 /// \brief Adds two 256-bit vectors of [8 x float].
     75 ///
     76 /// \headerfile <x86intrin.h>
     77 ///
     78 /// This intrinsic corresponds to the <c> VADDPS </c> instruction.
     79 ///
     80 /// \param __a
     81 ///    A 256-bit vector of [8 x float] containing one of the source operands.
     82 /// \param __b
     83 ///    A 256-bit vector of [8 x float] containing one of the source operands.
     84 /// \returns A 256-bit vector of [8 x float] containing the sums of both
     85 ///    operands.
     86 static __inline __m256 __DEFAULT_FN_ATTRS
     87 _mm256_add_ps(__m256 __a, __m256 __b)
     88 {
     89   return (__m256)((__v8sf)__a+(__v8sf)__b);
     90 }
     91 
     92 /// \brief Subtracts two 256-bit vectors of [4 x double].
     93 ///
     94 /// \headerfile <x86intrin.h>
     95 ///
     96 /// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
     97 ///
     98 /// \param __a
     99 ///    A 256-bit vector of [4 x double] containing the minuend.
    100 /// \param __b
    101 ///    A 256-bit vector of [4 x double] containing the subtrahend.
    102 /// \returns A 256-bit vector of [4 x double] containing the differences between
    103 ///    both operands.
    104 static __inline __m256d __DEFAULT_FN_ATTRS
    105 _mm256_sub_pd(__m256d __a, __m256d __b)
    106 {
    107   return (__m256d)((__v4df)__a-(__v4df)__b);
    108 }
    109 
    110 /// \brief Subtracts two 256-bit vectors of [8 x float].
    111 ///
    112 /// \headerfile <x86intrin.h>
    113 ///
    114 /// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
    115 ///
    116 /// \param __a
    117 ///    A 256-bit vector of [8 x float] containing the minuend.
    118 /// \param __b
    119 ///    A 256-bit vector of [8 x float] containing the subtrahend.
    120 /// \returns A 256-bit vector of [8 x float] containing the differences between
    121 ///    both operands.
    122 static __inline __m256 __DEFAULT_FN_ATTRS
    123 _mm256_sub_ps(__m256 __a, __m256 __b)
    124 {
    125   return (__m256)((__v8sf)__a-(__v8sf)__b);
    126 }
    127 
    128 /// \brief Adds the even-indexed values and subtracts the odd-indexed values of
    129 ///    two 256-bit vectors of [4 x double].
    130 ///
    131 /// \headerfile <x86intrin.h>
    132 ///
    133 /// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
    134 ///
    135 /// \param __a
    136 ///    A 256-bit vector of [4 x double] containing the left source operand.
    137 /// \param __b
    138 ///    A 256-bit vector of [4 x double] containing the right source operand.
    139 /// \returns A 256-bit vector of [4 x double] containing the alternating sums
    140 ///    and differences between both operands.
    141 static __inline __m256d __DEFAULT_FN_ATTRS
    142 _mm256_addsub_pd(__m256d __a, __m256d __b)
    143 {
    144   return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
    145 }
    146 
    147 /// \brief Adds the even-indexed values and subtracts the odd-indexed values of
    148 ///    two 256-bit vectors of [8 x float].
    149 ///
    150 /// \headerfile <x86intrin.h>
    151 ///
    152 /// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
    153 ///
    154 /// \param __a
    155 ///    A 256-bit vector of [8 x float] containing the left source operand.
    156 /// \param __b
    157 ///    A 256-bit vector of [8 x float] containing the right source operand.
    158 /// \returns A 256-bit vector of [8 x float] containing the alternating sums and
    159 ///    differences between both operands.
    160 static __inline __m256 __DEFAULT_FN_ATTRS
    161 _mm256_addsub_ps(__m256 __a, __m256 __b)
    162 {
    163   return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
    164 }
    165 
    166 /// \brief Divides two 256-bit vectors of [4 x double].
    167 ///
    168 /// \headerfile <x86intrin.h>
    169 ///
    170 /// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
    171 ///
    172 /// \param __a
    173 ///    A 256-bit vector of [4 x double] containing the dividend.
    174 /// \param __b
    175 ///    A 256-bit vector of [4 x double] containing the divisor.
    176 /// \returns A 256-bit vector of [4 x double] containing the quotients of both
    177 ///    operands.
    178 static __inline __m256d __DEFAULT_FN_ATTRS
    179 _mm256_div_pd(__m256d __a, __m256d __b)
    180 {
    181   return (__m256d)((__v4df)__a/(__v4df)__b);
    182 }
    183 
    184 /// \brief Divides two 256-bit vectors of [8 x float].
    185 ///
    186 /// \headerfile <x86intrin.h>
    187 ///
    188 /// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
    189 ///
    190 /// \param __a
    191 ///    A 256-bit vector of [8 x float] containing the dividend.
    192 /// \param __b
    193 ///    A 256-bit vector of [8 x float] containing the divisor.
    194 /// \returns A 256-bit vector of [8 x float] containing the quotients of both
    195 ///    operands.
    196 static __inline __m256 __DEFAULT_FN_ATTRS
    197 _mm256_div_ps(__m256 __a, __m256 __b)
    198 {
    199   return (__m256)((__v8sf)__a/(__v8sf)__b);
    200 }
    201 
    202 /// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
    203 ///    of each pair of values.
    204 ///
    205 /// \headerfile <x86intrin.h>
    206 ///
    207 /// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
    208 ///
    209 /// \param __a
    210 ///    A 256-bit vector of [4 x double] containing one of the operands.
    211 /// \param __b
    212 ///    A 256-bit vector of [4 x double] containing one of the operands.
    213 /// \returns A 256-bit vector of [4 x double] containing the maximum values
    214 ///    between both operands.
    215 static __inline __m256d __DEFAULT_FN_ATTRS
    216 _mm256_max_pd(__m256d __a, __m256d __b)
    217 {
    218   return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
    219 }
    220 
    221 /// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
    222 ///    of each pair of values.
    223 ///
    224 /// \headerfile <x86intrin.h>
    225 ///
    226 /// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
    227 ///
    228 /// \param __a
    229 ///    A 256-bit vector of [8 x float] containing one of the operands.
    230 /// \param __b
    231 ///    A 256-bit vector of [8 x float] containing one of the operands.
    232 /// \returns A 256-bit vector of [8 x float] containing the maximum values
    233 ///    between both operands.
    234 static __inline __m256 __DEFAULT_FN_ATTRS
    235 _mm256_max_ps(__m256 __a, __m256 __b)
    236 {
    237   return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
    238 }
    239 
    240 /// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
    241 ///    of each pair of values.
    242 ///
    243 /// \headerfile <x86intrin.h>
    244 ///
    245 /// This intrinsic corresponds to the <c> VMINPD </c> instruction.
    246 ///
    247 /// \param __a
    248 ///    A 256-bit vector of [4 x double] containing one of the operands.
    249 /// \param __b
    250 ///    A 256-bit vector of [4 x double] containing one of the operands.
    251 /// \returns A 256-bit vector of [4 x double] containing the minimum values
    252 ///    between both operands.
    253 static __inline __m256d __DEFAULT_FN_ATTRS
    254 _mm256_min_pd(__m256d __a, __m256d __b)
    255 {
    256   return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
    257 }
    258 
    259 /// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
    260 ///    of each pair of values.
    261 ///
    262 /// \headerfile <x86intrin.h>
    263 ///
    264 /// This intrinsic corresponds to the <c> VMINPS </c> instruction.
    265 ///
    266 /// \param __a
    267 ///    A 256-bit vector of [8 x float] containing one of the operands.
    268 /// \param __b
    269 ///    A 256-bit vector of [8 x float] containing one of the operands.
    270 /// \returns A 256-bit vector of [8 x float] containing the minimum values
    271 ///    between both operands.
    272 static __inline __m256 __DEFAULT_FN_ATTRS
    273 _mm256_min_ps(__m256 __a, __m256 __b)
    274 {
    275   return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
    276 }
    277 
    278 /// \brief Multiplies two 256-bit vectors of [4 x double].
    279 ///
    280 /// \headerfile <x86intrin.h>
    281 ///
    282 /// This intrinsic corresponds to the <c> VMULPD </c> instruction.
    283 ///
    284 /// \param __a
    285 ///    A 256-bit vector of [4 x double] containing one of the operands.
    286 /// \param __b
    287 ///    A 256-bit vector of [4 x double] containing one of the operands.
    288 /// \returns A 256-bit vector of [4 x double] containing the products of both
    289 ///    operands.
    290 static __inline __m256d __DEFAULT_FN_ATTRS
    291 _mm256_mul_pd(__m256d __a, __m256d __b)
    292 {
    293   return (__m256d)((__v4df)__a * (__v4df)__b);
    294 }
    295 
    296 /// \brief Multiplies two 256-bit vectors of [8 x float].
    297 ///
    298 /// \headerfile <x86intrin.h>
    299 ///
    300 /// This intrinsic corresponds to the <c> VMULPS </c> instruction.
    301 ///
    302 /// \param __a
    303 ///    A 256-bit vector of [8 x float] containing one of the operands.
    304 /// \param __b
    305 ///    A 256-bit vector of [8 x float] containing one of the operands.
    306 /// \returns A 256-bit vector of [8 x float] containing the products of both
    307 ///    operands.
    308 static __inline __m256 __DEFAULT_FN_ATTRS
    309 _mm256_mul_ps(__m256 __a, __m256 __b)
    310 {
    311   return (__m256)((__v8sf)__a * (__v8sf)__b);
    312 }
    313 
    314 /// \brief Calculates the square roots of the values in a 256-bit vector of
    315 ///    [4 x double].
    316 ///
    317 /// \headerfile <x86intrin.h>
    318 ///
    319 /// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
    320 ///
    321 /// \param __a
    322 ///    A 256-bit vector of [4 x double].
    323 /// \returns A 256-bit vector of [4 x double] containing the square roots of the
    324 ///    values in the operand.
    325 static __inline __m256d __DEFAULT_FN_ATTRS
    326 _mm256_sqrt_pd(__m256d __a)
    327 {
    328   return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
    329 }
    330 
    331 /// \brief Calculates the square roots of the values in a 256-bit vector of
    332 ///    [8 x float].
    333 ///
    334 /// \headerfile <x86intrin.h>
    335 ///
    336 /// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
    337 ///
    338 /// \param __a
    339 ///    A 256-bit vector of [8 x float].
    340 /// \returns A 256-bit vector of [8 x float] containing the square roots of the
    341 ///    values in the operand.
    342 static __inline __m256 __DEFAULT_FN_ATTRS
    343 _mm256_sqrt_ps(__m256 __a)
    344 {
    345   return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
    346 }
    347 
    348 /// \brief Calculates the reciprocal square roots of the values in a 256-bit
    349 ///    vector of [8 x float].
    350 ///
    351 /// \headerfile <x86intrin.h>
    352 ///
    353 /// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
    354 ///
    355 /// \param __a
    356 ///    A 256-bit vector of [8 x float].
    357 /// \returns A 256-bit vector of [8 x float] containing the reciprocal square
    358 ///    roots of the values in the operand.
    359 static __inline __m256 __DEFAULT_FN_ATTRS
    360 _mm256_rsqrt_ps(__m256 __a)
    361 {
    362   return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
    363 }
    364 
    365 /// \brief Calculates the reciprocals of the values in a 256-bit vector of
    366 ///    [8 x float].
    367 ///
    368 /// \headerfile <x86intrin.h>
    369 ///
    370 /// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
    371 ///
    372 /// \param __a
    373 ///    A 256-bit vector of [8 x float].
    374 /// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
    375 ///    values in the operand.
    376 static __inline __m256 __DEFAULT_FN_ATTRS
    377 _mm256_rcp_ps(__m256 __a)
    378 {
    379   return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
    380 }
    381 
    382 /// \brief Rounds the values in a 256-bit vector of [4 x double] as specified
    383 ///    by the byte operand. The source values are rounded to integer values and
    384 ///    returned as 64-bit double-precision floating-point values.
    385 ///
    386 /// \headerfile <x86intrin.h>
    387 ///
    388 /// \code
    389 /// __m256d _mm256_round_pd(__m256d V, const int M);
    390 /// \endcode
    391 ///
    392 /// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
    393 ///
    394 /// \param V
    395 ///    A 256-bit vector of [4 x double].
    396 /// \param M
    397 ///    An integer value that specifies the rounding operation. \n
    398 ///    Bits [7:4] are reserved. \n
    399 ///    Bit [3] is a precision exception value: \n
    400 ///      0: A normal PE exception is used. \n
    401 ///      1: The PE field is not updated. \n
    402 ///    Bit [2] is the rounding control source: \n
    403 ///      0: Use bits [1:0] of \a M. \n
    404 ///      1: Use the current MXCSR setting. \n
    405 ///    Bits [1:0] contain the rounding control definition: \n
    406 ///      00: Nearest. \n
    407 ///      01: Downward (toward negative infinity). \n
    408 ///      10: Upward (toward positive infinity). \n
    409 ///      11: Truncated.
    410 /// \returns A 256-bit vector of [4 x double] containing the rounded values.
    411 #define _mm256_round_pd(V, M) __extension__ ({ \
    412     (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); })
    413 
    414 /// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
    415 ///    specified by the byte operand. The source values are rounded to integer
    416 ///    values and returned as floating-point values.
    417 ///
    418 /// \headerfile <x86intrin.h>
    419 ///
    420 /// \code
    421 /// __m256 _mm256_round_ps(__m256 V, const int M);
    422 /// \endcode
    423 ///
    424 /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
    425 ///
    426 /// \param V
    427 ///    A 256-bit vector of [8 x float].
    428 /// \param M
    429 ///    An integer value that specifies the rounding operation. \n
    430 ///    Bits [7:4] are reserved. \n
    431 ///    Bit [3] is a precision exception value: \n
    432 ///      0: A normal PE exception is used. \n
    433 ///      1: The PE field is not updated. \n
    434 ///    Bit [2] is the rounding control source: \n
    435 ///      0: Use bits [1:0] of \a M. \n
    436 ///      1: Use the current MXCSR setting. \n
    437 ///    Bits [1:0] contain the rounding control definition: \n
    438 ///      00: Nearest. \n
    439 ///      01: Downward (toward negative infinity). \n
    440 ///      10: Upward (toward positive infinity). \n
    441 ///      11: Truncated.
    442 /// \returns A 256-bit vector of [8 x float] containing the rounded values.
    443 #define _mm256_round_ps(V, M) __extension__ ({ \
    444   (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); })
    445 
    446 /// \brief Rounds up the values stored in a 256-bit vector of [4 x double]. The
    447 ///    source values are rounded up to integer values and returned as 64-bit
    448 ///    double-precision floating-point values.
    449 ///
    450 /// \headerfile <x86intrin.h>
    451 ///
    452 /// \code
    453 /// __m256d _mm256_ceil_pd(__m256d V);
    454 /// \endcode
    455 ///
    456 /// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
    457 ///
    458 /// \param V
    459 ///    A 256-bit vector of [4 x double].
    460 /// \returns A 256-bit vector of [4 x double] containing the rounded up values.
    461 #define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
    462 
    463 /// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
    464 ///    The source values are rounded down to integer values and returned as
    465 ///    64-bit double-precision floating-point values.
    466 ///
    467 /// \headerfile <x86intrin.h>
    468 ///
    469 /// \code
    470 /// __m256d _mm256_floor_pd(__m256d V);
    471 /// \endcode
    472 ///
    473 /// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
    474 ///
    475 /// \param V
    476 ///    A 256-bit vector of [4 x double].
    477 /// \returns A 256-bit vector of [4 x double] containing the rounded down
    478 ///    values.
    479 #define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
    480 
    481 /// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
    482 ///    source values are rounded up to integer values and returned as
    483 ///    floating-point values.
    484 ///
    485 /// \headerfile <x86intrin.h>
    486 ///
    487 /// \code
    488 /// __m256 _mm256_ceil_ps(__m256 V);
    489 /// \endcode
    490 ///
    491 /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
    492 ///
    493 /// \param V
    494 ///    A 256-bit vector of [8 x float].
    495 /// \returns A 256-bit vector of [8 x float] containing the rounded up values.
    496 #define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
    497 
    498 /// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The
    499 ///    source values are rounded down to integer values and returned as
    500 ///    floating-point values.
    501 ///
    502 /// \headerfile <x86intrin.h>
    503 ///
    504 /// \code
    505 /// __m256 _mm256_floor_ps(__m256 V);
    506 /// \endcode
    507 ///
    508 /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
    509 ///
    510 /// \param V
    511 ///    A 256-bit vector of [8 x float].
    512 /// \returns A 256-bit vector of [8 x float] containing the rounded down values.
    513 #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
    514 
    515 /* Logical */
    516 /// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
    517 ///
    518 /// \headerfile <x86intrin.h>
    519 ///
    520 /// This intrinsic corresponds to the <c> VANDPD </c> instruction.
    521 ///
    522 /// \param __a
    523 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    524 /// \param __b
    525 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    526 /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
    527 ///    values between both operands.
    528 static __inline __m256d __DEFAULT_FN_ATTRS
    529 _mm256_and_pd(__m256d __a, __m256d __b)
    530 {
    531   return (__m256d)((__v4du)__a & (__v4du)__b);
    532 }
    533 
    534 /// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
    535 ///
    536 /// \headerfile <x86intrin.h>
    537 ///
    538 /// This intrinsic corresponds to the <c> VANDPS </c> instruction.
    539 ///
    540 /// \param __a
    541 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    542 /// \param __b
    543 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    544 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
    545 ///    values between both operands.
    546 static __inline __m256 __DEFAULT_FN_ATTRS
    547 _mm256_and_ps(__m256 __a, __m256 __b)
    548 {
    549   return (__m256)((__v8su)__a & (__v8su)__b);
    550 }
    551 
    552 /// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
    553 ///    the one's complement of the values contained in the first source operand.
    554 ///
    555 /// \headerfile <x86intrin.h>
    556 ///
    557 /// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
    558 ///
    559 /// \param __a
    560 ///    A 256-bit vector of [4 x double] containing the left source operand. The
    561 ///    one's complement of this value is used in the bitwise AND.
    562 /// \param __b
    563 ///    A 256-bit vector of [4 x double] containing the right source operand.
    564 /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
    565 ///    values of the second operand and the one's complement of the first
    566 ///    operand.
    567 static __inline __m256d __DEFAULT_FN_ATTRS
    568 _mm256_andnot_pd(__m256d __a, __m256d __b)
    569 {
    570   return (__m256d)(~(__v4du)__a & (__v4du)__b);
    571 }
    572 
    573 /// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
    574 ///    the one's complement of the values contained in the first source operand.
    575 ///
    576 /// \headerfile <x86intrin.h>
    577 ///
    578 /// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
    579 ///
    580 /// \param __a
    581 ///    A 256-bit vector of [8 x float] containing the left source operand. The
    582 ///    one's complement of this value is used in the bitwise AND.
    583 /// \param __b
    584 ///    A 256-bit vector of [8 x float] containing the right source operand.
    585 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
    586 ///    values of the second operand and the one's complement of the first
    587 ///    operand.
    588 static __inline __m256 __DEFAULT_FN_ATTRS
    589 _mm256_andnot_ps(__m256 __a, __m256 __b)
    590 {
    591   return (__m256)(~(__v8su)__a & (__v8su)__b);
    592 }
    593 
    594 /// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
    595 ///
    596 /// \headerfile <x86intrin.h>
    597 ///
    598 /// This intrinsic corresponds to the <c> VORPD </c> instruction.
    599 ///
    600 /// \param __a
    601 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    602 /// \param __b
    603 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    604 /// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
    605 ///    values between both operands.
    606 static __inline __m256d __DEFAULT_FN_ATTRS
    607 _mm256_or_pd(__m256d __a, __m256d __b)
    608 {
    609   return (__m256d)((__v4du)__a | (__v4du)__b);
    610 }
    611 
    612 /// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
    613 ///
    614 /// \headerfile <x86intrin.h>
    615 ///
    616 /// This intrinsic corresponds to the <c> VORPS </c> instruction.
    617 ///
    618 /// \param __a
    619 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    620 /// \param __b
    621 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    622 /// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
    623 ///    values between both operands.
    624 static __inline __m256 __DEFAULT_FN_ATTRS
    625 _mm256_or_ps(__m256 __a, __m256 __b)
    626 {
    627   return (__m256)((__v8su)__a | (__v8su)__b);
    628 }
    629 
    630 /// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
    631 ///
    632 /// \headerfile <x86intrin.h>
    633 ///
    634 /// This intrinsic corresponds to the <c> VXORPD </c> instruction.
    635 ///
    636 /// \param __a
    637 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    638 /// \param __b
    639 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    640 /// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
    641 ///    values between both operands.
    642 static __inline __m256d __DEFAULT_FN_ATTRS
    643 _mm256_xor_pd(__m256d __a, __m256d __b)
    644 {
    645   return (__m256d)((__v4du)__a ^ (__v4du)__b);
    646 }
    647 
    648 /// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
    649 ///
    650 /// \headerfile <x86intrin.h>
    651 ///
    652 /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
    653 ///
    654 /// \param __a
    655 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    656 /// \param __b
    657 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    658 /// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
    659 ///    values between both operands.
    660 static __inline __m256 __DEFAULT_FN_ATTRS
    661 _mm256_xor_ps(__m256 __a, __m256 __b)
    662 {
    663   return (__m256)((__v8su)__a ^ (__v8su)__b);
    664 }
    665 
    666 /* Horizontal arithmetic */
    667 /// \brief Horizontally adds the adjacent pairs of values contained in two
    668 ///    256-bit vectors of [4 x double].
    669 ///
    670 /// \headerfile <x86intrin.h>
    671 ///
    672 /// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
    673 ///
    674 /// \param __a
    675 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    676 ///    The horizontal sums of the values are returned in the even-indexed
    677 ///    elements of a vector of [4 x double].
    678 /// \param __b
    679 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    680 ///    The horizontal sums of the values are returned in the odd-indexed
    681 ///    elements of a vector of [4 x double].
    682 /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
    683 ///    both operands.
    684 static __inline __m256d __DEFAULT_FN_ATTRS
    685 _mm256_hadd_pd(__m256d __a, __m256d __b)
    686 {
    687   return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
    688 }
    689 
    690 /// \brief Horizontally adds the adjacent pairs of values contained in two
    691 ///    256-bit vectors of [8 x float].
    692 ///
    693 /// \headerfile <x86intrin.h>
    694 ///
    695 /// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
    696 ///
    697 /// \param __a
    698 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    699 ///    The horizontal sums of the values are returned in the elements with
    700 ///    index 0, 1, 4, 5 of a vector of [8 x float].
    701 /// \param __b
    702 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    703 ///    The horizontal sums of the values are returned in the elements with
    704 ///    index 2, 3, 6, 7 of a vector of [8 x float].
    705 /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
    706 ///    both operands.
    707 static __inline __m256 __DEFAULT_FN_ATTRS
    708 _mm256_hadd_ps(__m256 __a, __m256 __b)
    709 {
    710   return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
    711 }
    712 
    713 /// \brief Horizontally subtracts the adjacent pairs of values contained in two
    714 ///    256-bit vectors of [4 x double].
    715 ///
    716 /// \headerfile <x86intrin.h>
    717 ///
    718 /// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
    719 ///
    720 /// \param __a
    721 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    722 ///    The horizontal differences between the values are stored in the
    723 ///    even-indexed elements of the returned vector of [4 x double].
    724 /// \param __b
    725 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    726 ///    The horizontal differences between the values are stored in the
    727 ///    odd-indexed elements of the returned vector of [4 x double].
    728 /// \returns A 256-bit vector of [4 x double] containing the horizontal
    729 ///    differences of both operands.
    730 static __inline __m256d __DEFAULT_FN_ATTRS
    731 _mm256_hsub_pd(__m256d __a, __m256d __b)
    732 {
    733   return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
    734 }
    735 
    736 /// \brief Horizontally subtracts the adjacent pairs of values contained in two
    737 ///    256-bit vectors of [8 x float].
    738 ///
    739 /// \headerfile <x86intrin.h>
    740 ///
    741 /// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
    742 ///
    743 /// \param __a
    744 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    745 ///    The horizontal differences between the values are stored in the
    746 ///    elements with index 0, 1, 4, 5 of the returned vector of [8 x float].
    747 /// \param __b
    748 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    749 ///    The horizontal differences between the values are stored in the
    750 ///    elements with index 2, 3, 6, 7 of the returned vector of [8 x float].
    751 /// \returns A 256-bit vector of [8 x float] containing the horizontal
    752 ///    differences of both operands.
    753 static __inline __m256 __DEFAULT_FN_ATTRS
    754 _mm256_hsub_ps(__m256 __a, __m256 __b)
    755 {
    756   return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
    757 }
    758 
    759 /* Vector permutations */
    760 /// \brief Copies the values in a 128-bit vector of [2 x double] as specified
    761 ///    by the 128-bit integer vector operand.
    762 ///
    763 /// \headerfile <x86intrin.h>
    764 ///
    765 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
    766 ///
    767 /// \param __a
    768 ///    A 128-bit vector of [2 x double]; the source of the copied values.
    769 /// \param __c
    770 ///    A 128-bit integer vector operand specifying how the values are to be
    771 ///    copied. \n
    772 ///    Bit [1]: \n
    773 ///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
    774 ///         vector. \n
    775 ///      1: Bits [127:64] of the source are copied to bits [63:0] of the
    776 ///         returned vector. \n
    777 ///    Bit [65]: \n
    778 ///      0: Bits [63:0] of the source are copied to bits [127:64] of the
    779 ///         returned vector. \n
    780 ///      1: Bits [127:64] of the source are copied to bits [127:64] of the
    781 ///         returned vector.
    782 /// \returns A 128-bit vector of [2 x double] containing the copied values.
    783 static __inline __m128d __DEFAULT_FN_ATTRS
    784 _mm_permutevar_pd(__m128d __a, __m128i __c)
    785 {
    786   return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
    787 }
    788 
    789 /// \brief Copies the values in a 256-bit vector of [4 x double] as specified
    790 ///    by the 256-bit integer vector operand.
    791 ///
    792 /// \headerfile <x86intrin.h>
    793 ///
    794 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
    795 ///
    796 /// \param __a
    797 ///    A 256-bit vector of [4 x double]; the source of the copied values.
    798 /// \param __c
    799 ///    A 256-bit integer vector operand specifying how the values are to be
    800 ///    copied. \n
    801 ///    Bit [1]: \n
    802 ///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
    803 ///         vector. \n
    804 ///      1: Bits [127:64] of the source are copied to bits [63:0] of the
    805 ///         returned vector. \n
    806 ///    Bit [65]: \n
    807 ///      0: Bits [63:0] of the source are copied to bits [127:64] of the
    808 ///         returned vector. \n
    809 ///      1: Bits [127:64] of the source are copied to bits [127:64] of the
    810 ///         returned vector. \n
    811 ///    Bit [129]: \n
    812 ///      0: Bits [191:128] of the source are copied to bits [191:128] of the
    813 ///         returned vector. \n
    814 ///      1: Bits [255:192] of the source are copied to bits [191:128] of the
    815 ///         returned vector. \n
    816 ///    Bit [193]: \n
    817 ///      0: Bits [191:128] of the source are copied to bits [255:192] of the
    818 ///         returned vector. \n
    819 ///      1: Bits [255:192] of the source are copied to bits [255:192] of the
    820 ///         returned vector.
    821 /// \returns A 256-bit vector of [4 x double] containing the copied values.
    822 static __inline __m256d __DEFAULT_FN_ATTRS
    823 _mm256_permutevar_pd(__m256d __a, __m256i __c)
    824 {
    825   return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
    826 }
    827 
    828 /// \brief Copies the values stored in a 128-bit vector of [4 x float] as
    829 ///    specified by the 128-bit integer vector operand.
    830 /// \headerfile <x86intrin.h>
    831 ///
    832 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
    833 ///
    834 /// \param __a
    835 ///    A 128-bit vector of [4 x float]; the source of the copied values.
    836 /// \param __c
    837 ///    A 128-bit integer vector operand specifying how the values are to be
    838 ///    copied. \n
    839 ///    Bits [1:0]: \n
    840 ///      00: Bits [31:0] of the source are copied to bits [31:0] of the
    841 ///          returned vector. \n
    842 ///      01: Bits [63:32] of the source are copied to bits [31:0] of the
    843 ///          returned vector. \n
    844 ///      10: Bits [95:64] of the source are copied to bits [31:0] of the
    845 ///          returned vector. \n
    846 ///      11: Bits [127:96] of the source are copied to bits [31:0] of the
    847 ///          returned vector. \n
    848 ///    Bits [33:32]: \n
    849 ///      00: Bits [31:0] of the source are copied to bits [63:32] of the
    850 ///          returned vector. \n
    851 ///      01: Bits [63:32] of the source are copied to bits [63:32] of the
    852 ///          returned vector. \n
    853 ///      10: Bits [95:64] of the source are copied to bits [63:32] of the
    854 ///          returned vector. \n
    855 ///      11: Bits [127:96] of the source are copied to bits [63:32] of the
    856 ///          returned vector. \n
    857 ///    Bits [65:64]: \n
    858 ///      00: Bits [31:0] of the source are copied to bits [95:64] of the
    859 ///          returned vector. \n
    860 ///      01: Bits [63:32] of the source are copied to bits [95:64] of the
    861 ///          returned vector. \n
    862 ///      10: Bits [95:64] of the source are copied to bits [95:64] of the
    863 ///          returned vector. \n
    864 ///      11: Bits [127:96] of the source are copied to bits [95:64] of the
    865 ///          returned vector. \n
    866 ///    Bits [97:96]: \n
    867 ///      00: Bits [31:0] of the source are copied to bits [127:96] of the
    868 ///          returned vector. \n
    869 ///      01: Bits [63:32] of the source are copied to bits [127:96] of the
    870 ///          returned vector. \n
    871 ///      10: Bits [95:64] of the source are copied to bits [127:96] of the
    872 ///          returned vector. \n
    873 ///      11: Bits [127:96] of the source are copied to bits [127:96] of the
    874 ///          returned vector.
    875 /// \returns A 128-bit vector of [4 x float] containing the copied values.
    876 static __inline __m128 __DEFAULT_FN_ATTRS
    877 _mm_permutevar_ps(__m128 __a, __m128i __c)
    878 {
    879   return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
    880 }
    881 
    882 /// \brief Copies the values stored in a 256-bit vector of [8 x float] as
    883 ///    specified by the 256-bit integer vector operand.
    884 ///
    885 /// \headerfile <x86intrin.h>
    886 ///
    887 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
    888 ///
    889 /// \param __a
    890 ///    A 256-bit vector of [8 x float]; the source of the copied values.
    891 /// \param __c
    892 ///    A 256-bit integer vector operand specifying how the values are to be
    893 ///    copied. \n
    894 ///    Bits [1:0]: \n
    895 ///      00: Bits [31:0] of the source are copied to bits [31:0] of the
    896 ///          returned vector. \n
    897 ///      01: Bits [63:32] of the source are copied to bits [31:0] of the
    898 ///          returned vector. \n
    899 ///      10: Bits [95:64] of the source are copied to bits [31:0] of the
    900 ///          returned vector. \n
    901 ///      11: Bits [127:96] of the source are copied to bits [31:0] of the
    902 ///          returned vector. \n
    903 ///    Bits [33:32]: \n
    904 ///      00: Bits [31:0] of the source are copied to bits [63:32] of the
    905 ///          returned vector. \n
    906 ///      01: Bits [63:32] of the source are copied to bits [63:32] of the
    907 ///          returned vector. \n
    908 ///      10: Bits [95:64] of the source are copied to bits [63:32] of the
    909 ///          returned vector. \n
    910 ///      11: Bits [127:96] of the source are copied to bits [63:32] of the
    911 ///          returned vector. \n
    912 ///    Bits [65:64]: \n
    913 ///      00: Bits [31:0] of the source are copied to bits [95:64] of the
    914 ///          returned vector. \n
    915 ///      01: Bits [63:32] of the source are copied to bits [95:64] of the
    916 ///          returned vector. \n
    917 ///      10: Bits [95:64] of the source are copied to bits [95:64] of the
    918 ///          returned vector. \n
    919 ///      11: Bits [127:96] of the source are copied to bits [95:64] of the
    920 ///          returned vector. \n
    921 ///    Bits [97:96]: \n
    922 ///      00: Bits [31:0] of the source are copied to bits [127:96] of the
    923 ///          returned vector. \n
    924 ///      01: Bits [63:32] of the source are copied to bits [127:96] of the
    925 ///          returned vector. \n
    926 ///      10: Bits [95:64] of the source are copied to bits [127:96] of the
    927 ///          returned vector. \n
    928 ///      11: Bits [127:96] of the source are copied to bits [127:96] of the
    929 ///          returned vector. \n
    930 ///    Bits [129:128]: \n
    931 ///      00: Bits [159:128] of the source are copied to bits [159:128] of the
    932 ///          returned vector. \n
    933 ///      01: Bits [191:160] of the source are copied to bits [159:128] of the
    934 ///          returned vector. \n
    935 ///      10: Bits [223:192] of the source are copied to bits [159:128] of the
    936 ///          returned vector. \n
    937 ///      11: Bits [255:224] of the source are copied to bits [159:128] of the
    938 ///          returned vector. \n
    939 ///    Bits [161:160]: \n
    940 ///      00: Bits [159:128] of the source are copied to bits [191:160] of the
    941 ///          returned vector. \n
    942 ///      01: Bits [191:160] of the source are copied to bits [191:160] of the
    943 ///          returned vector. \n
    944 ///      10: Bits [223:192] of the source are copied to bits [191:160] of the
    945 ///          returned vector. \n
    946 ///      11: Bits [255:224] of the source are copied to bits [191:160] of the
    947 ///          returned vector. \n
    948 ///    Bits [193:192]: \n
    949 ///      00: Bits [159:128] of the source are copied to bits [223:192] of the
    950 ///          returned vector. \n
    951 ///      01: Bits [191:160] of the source are copied to bits [223:192] of the
    952 ///          returned vector. \n
    953 ///      10: Bits [223:192] of the source are copied to bits [223:192] of the
    954 ///          returned vector. \n
    955 ///      11: Bits [255:224] of the source are copied to bits [223:192] of the
    956 ///          returned vector. \n
    957 ///    Bits [225:224]: \n
    958 ///      00: Bits [159:128] of the source are copied to bits [255:224] of the
    959 ///          returned vector. \n
    960 ///      01: Bits [191:160] of the source are copied to bits [255:224] of the
    961 ///          returned vector. \n
    962 ///      10: Bits [223:192] of the source are copied to bits [255:224] of the
    963 ///          returned vector. \n
    964 ///      11: Bits [255:224] of the source are copied to bits [255:224] of the
    965 ///          returned vector.
    966 /// \returns A 256-bit vector of [8 x float] containing the copied values.
    967 static __inline __m256 __DEFAULT_FN_ATTRS
    968 _mm256_permutevar_ps(__m256 __a, __m256i __c)
    969 {
    970   return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
    971 }
    972 
    973 /// \brief Copies the values in a 128-bit vector of [2 x double] as specified
    974 ///    by the immediate integer operand.
    975 ///
    976 /// \headerfile <x86intrin.h>
    977 ///
    978 /// \code
    979 /// __m128d _mm_permute_pd(__m128d A, const int C);
    980 /// \endcode
    981 ///
    982 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
    983 ///
    984 /// \param A
    985 ///    A 128-bit vector of [2 x double]; the source of the copied values.
    986 /// \param C
    987 ///    An immediate integer operand specifying how the values are to be
    988 ///    copied; only bits [1:0] of \a C are used. \n
    989 ///    Bit [0]: \n
    990 ///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
    991 ///         vector. \n
    992 ///      1: Bits [127:64] of the source are copied to bits [63:0] of the
    993 ///         returned vector. \n
    994 ///    Bit [1]: \n
    995 ///      0: Bits [63:0] of the source are copied to bits [127:64] of the
    996 ///         returned vector. \n
    997 ///      1: Bits [127:64] of the source are copied to bits [127:64] of the
    998 ///         returned vector.
    999 /// \returns A 128-bit vector of [2 x double] containing the copied values.
   1000 #define _mm_permute_pd(A, C) __extension__ ({ \
   1001   (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
   1002                                    (__v2df)_mm_undefined_pd(), \
   1003                                    ((C) >> 0) & 0x1, ((C) >> 1) & 0x1); })
   1004 
   1005 /// \brief Copies the values in a 256-bit vector of [4 x double] as specified by
   1006 ///    the immediate integer operand.
   1007 ///
   1008 /// \headerfile <x86intrin.h>
   1009 ///
   1010 /// \code
   1011 /// __m256d _mm256_permute_pd(__m256d A, const int C);
   1012 /// \endcode
   1013 ///
   1014 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
   1015 ///
   1016 /// \param A
   1017 ///    A 256-bit vector of [4 x double]; the source of the copied values.
   1018 /// \param C
   1019 ///    An immediate integer operand specifying how the values are to be
   1020 ///    copied; only bits [3:0] of \a C are used. \n
   1021 ///    Bit [0]: \n
   1022 ///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
   1023 ///         vector. \n
   1024 ///      1: Bits [127:64] of the source are copied to bits [63:0] of the
   1025 ///         returned vector. \n
   1026 ///    Bit [1]: \n
   1027 ///      0: Bits [63:0] of the source are copied to bits [127:64] of the
   1028 ///         returned vector. \n
   1029 ///      1: Bits [127:64] of the source are copied to bits [127:64] of the
   1030 ///         returned vector. \n
   1031 ///    Bit [2]: \n
   1032 ///      0: Bits [191:128] of the source are copied to bits [191:128] of the
   1033 ///         returned vector. \n
   1034 ///      1: Bits [255:192] of the source are copied to bits [191:128] of the
   1035 ///         returned vector. \n
   1036 ///    Bit [3]: \n
   1037 ///      0: Bits [191:128] of the source are copied to bits [255:192] of the
   1038 ///         returned vector. \n
   1039 ///      1: Bits [255:192] of the source are copied to bits [255:192] of the
   1040 ///         returned vector.
   1041 /// \returns A 256-bit vector of [4 x double] containing the copied values.
   1042 #define _mm256_permute_pd(A, C) __extension__ ({ \
   1043   (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
   1044                                    (__v4df)_mm256_undefined_pd(), \
   1045                                    0 + (((C) >> 0) & 0x1), \
   1046                                    0 + (((C) >> 1) & 0x1), \
   1047                                    2 + (((C) >> 2) & 0x1), \
   1048                                    2 + (((C) >> 3) & 0x1)); })
   1049 
   1050 /// \brief Copies the values in a 128-bit vector of [4 x float] as specified by
   1051 ///    the immediate integer operand.
   1052 ///
   1053 /// \headerfile <x86intrin.h>
   1054 ///
   1055 /// \code
   1056 /// __m128 _mm_permute_ps(__m128 A, const int C);
   1057 /// \endcode
   1058 ///
   1059 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
   1060 ///
   1061 /// \param A
   1062 ///    A 128-bit vector of [4 x float]; the source of the copied values.
   1063 /// \param C
   1064 ///    An immediate integer operand specifying how the values are to be
   1065 ///    copied; only bits [7:0] of \a C are used. \n
   1066 ///    Bits [1:0]: \n
   1067 ///      00: Bits [31:0] of the source are copied to bits [31:0] of the
   1068 ///          returned vector. \n
   1069 ///      01: Bits [63:32] of the source are copied to bits [31:0] of the
   1070 ///          returned vector. \n
   1071 ///      10: Bits [95:64] of the source are copied to bits [31:0] of the
   1072 ///          returned vector. \n
   1073 ///      11: Bits [127:96] of the source are copied to bits [31:0] of the
   1074 ///          returned vector. \n
   1075 ///    Bits [3:2]: \n
   1076 ///      00: Bits [31:0] of the source are copied to bits [63:32] of the
   1077 ///          returned vector. \n
   1078 ///      01: Bits [63:32] of the source are copied to bits [63:32] of the
   1079 ///          returned vector. \n
   1080 ///      10: Bits [95:64] of the source are copied to bits [63:32] of the
   1081 ///          returned vector. \n
   1082 ///      11: Bits [127:96] of the source are copied to bits [63:32] of the
   1083 ///          returned vector. \n
   1084 ///    Bits [5:4]: \n
   1085 ///      00: Bits [31:0] of the source are copied to bits [95:64] of the
   1086 ///          returned vector. \n
   1087 ///      01: Bits [63:32] of the source are copied to bits [95:64] of the
   1088 ///          returned vector. \n
   1089 ///      10: Bits [95:64] of the source are copied to bits [95:64] of the
   1090 ///          returned vector. \n
   1091 ///      11: Bits [127:96] of the source are copied to bits [95:64] of the
   1092 ///          returned vector. \n
   1093 ///    Bits [7:6]: \n
   1094 ///      00: Bits [31:0] of the source are copied to bits [127:96] of the
   1095 ///          returned vector. \n
   1096 ///      01: Bits [63:32] of the source are copied to bits [127:96] of the
   1097 ///          returned vector. \n
   1098 ///      10: Bits [95:64] of the source are copied to bits [127:96] of the
   1099 ///          returned vector. \n
   1100 ///      11: Bits [127:96] of the source are copied to bits [127:96] of the
   1101 ///          returned vector.
   1102 /// \returns A 128-bit vector of [4 x float] containing the copied values.
   1103 #define _mm_permute_ps(A, C) __extension__ ({ \
   1104   (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
   1105                                   (__v4sf)_mm_undefined_ps(), \
   1106                                   ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
   1107                                   ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
   1108 
   1109 /// \brief Copies the values in a 256-bit vector of [8 x float] as specified by
   1110 ///    the immediate integer operand.
   1111 ///
   1112 /// \headerfile <x86intrin.h>
   1113 ///
   1114 /// \code
   1115 /// __m256 _mm256_permute_ps(__m256 A, const int C);
   1116 /// \endcode
   1117 ///
   1118 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
   1119 ///
   1120 /// \param A
   1121 ///    A 256-bit vector of [8 x float]; the source of the copied values.
   1122 /// \param C
   1123 ///    An immediate integer operand specifying how the values are to be
   1124 ///    copied. The same bits select elements in both 128-bit halves. \n
   1125 ///    Bits [1:0]: \n
   1126 ///      00: Bits [31:0] of the source are copied to bits [31:0] of the
   1127 ///          returned vector. \n
   1128 ///      01: Bits [63:32] of the source are copied to bits [31:0] of the
   1129 ///          returned vector. \n
   1130 ///      10: Bits [95:64] of the source are copied to bits [31:0] of the
   1131 ///          returned vector. \n
   1132 ///      11: Bits [127:96] of the source are copied to bits [31:0] of the
   1133 ///          returned vector. \n
   1134 ///    Bits [3:2]: \n
   1135 ///      00: Bits [31:0] of the source are copied to bits [63:32] of the
   1136 ///          returned vector. \n
   1137 ///      01: Bits [63:32] of the source are copied to bits [63:32] of the
   1138 ///          returned vector. \n
   1139 ///      10: Bits [95:64] of the source are copied to bits [63:32] of the
   1140 ///          returned vector. \n
   1141 ///      11: Bits [127:96] of the source are copied to bits [63:32] of the
   1142 ///          returned vector. \n
   1143 ///    Bits [5:4]: \n
   1144 ///      00: Bits [31:0] of the source are copied to bits [95:64] of the
   1145 ///          returned vector. \n
   1146 ///      01: Bits [63:32] of the source are copied to bits [95:64] of the
   1147 ///          returned vector. \n
   1148 ///      10: Bits [95:64] of the source are copied to bits [95:64] of the
   1149 ///          returned vector. \n
   1150 ///      11: Bits [127:96] of the source are copied to bits [95:64] of the
   1151 ///          returned vector. \n
   1152 ///    Bits [7:6]: \n
   1153 ///      00: Bits [31:0] of the source are copied to bits [127:96] of the
   1154 ///          returned vector. \n
   1155 ///      01: Bits [63:32] of the source are copied to bits [127:96] of the
   1156 ///          returned vector. \n
   1157 ///      10: Bits [95:64] of the source are copied to bits [127:96] of the
   1158 ///          returned vector. \n
   1159 ///      11: Bits [127:96] of the source are copied to bits [127:96] of the
   1160 ///          returned vector. \n
   1161 ///    Bits [1:0]: \n
   1162 ///      00: Bits [159:128] of the source are copied to bits [159:128] of the
   1163 ///          returned vector. \n
   1164 ///      01: Bits [191:160] of the source are copied to bits [159:128] of the
   1165 ///          returned vector. \n
   1166 ///      10: Bits [223:192] of the source are copied to bits [159:128] of the
   1167 ///          returned vector. \n
   1168 ///      11: Bits [255:224] of the source are copied to bits [159:128] of the
   1169 ///          returned vector. \n
   1170 ///    Bits [3:2]: \n
   1171 ///      00: Bits [159:128] of the source are copied to bits [191:160] of the
   1172 ///          returned vector. \n
   1173 ///      01: Bits [191:160] of the source are copied to bits [191:160] of the
   1174 ///          returned vector. \n
   1175 ///      10: Bits [223:192] of the source are copied to bits [191:160] of the
   1176 ///          returned vector. \n
   1177 ///      11: Bits [255:224] of the source are copied to bits [191:160] of the
   1178 ///          returned vector. \n
   1179 ///    Bits [5:4]: \n
   1180 ///      00: Bits [159:128] of the source are copied to bits [223:192] of the
   1181 ///          returned vector. \n
   1182 ///      01: Bits [191:160] of the source are copied to bits [223:192] of the
   1183 ///          returned vector. \n
   1184 ///      10: Bits [223:192] of the source are copied to bits [223:192] of the
   1185 ///          returned vector. \n
   1186 ///      11: Bits [255:224] of the source are copied to bits [223:192] of the
   1187 ///          returned vector. \n
   1188 ///    Bits [7:6]: \n
   1189 ///      00: Bits [159:128] of the source are copied to bits [255:224] of the
   1190 ///          returned vector. \n
   1191 ///      01: Bits [191:160] of the source are copied to bits [255:224] of the
   1192 ///          returned vector. \n
   1193 ///      10: Bits [223:192] of the source are copied to bits [255:224] of the
   1194 ///          returned vector. \n
   1195 ///      11: Bits [255:224] of the source are copied to bits [255:224] of the
   1196 ///          returned vector.
   1197 /// \returns A 256-bit vector of [8 x float] containing the copied values.
   1198 #define _mm256_permute_ps(A, C) __extension__ ({ \
   1199   (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
   1200                                   (__v8sf)_mm256_undefined_ps(), \
   1201                                   0 + (((C) >> 0) & 0x3), \
   1202                                   0 + (((C) >> 2) & 0x3), \
   1203                                   0 + (((C) >> 4) & 0x3), \
   1204                                   0 + (((C) >> 6) & 0x3), \
   1205                                   4 + (((C) >> 0) & 0x3), \
   1206                                   4 + (((C) >> 2) & 0x3), \
   1207                                   4 + (((C) >> 4) & 0x3), \
   1208                                   4 + (((C) >> 6) & 0x3)); })
   1209 
   1210 /// \brief Permutes 128-bit data values stored in two 256-bit vectors of
   1211 ///    [4 x double], as specified by the immediate integer operand.
   1212 ///
   1213 /// \headerfile <x86intrin.h>
   1214 ///
   1215 /// \code
   1216 /// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
   1217 /// \endcode
   1218 ///
   1219 /// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
   1220 ///
   1221 /// \param V1
   1222 ///    A 256-bit vector of [4 x double].
   1223 /// \param V2
   1224 ///    A 256-bit vector of [4 x double].
   1225 /// \param M
   1226 ///    An immediate integer operand specifying how the values are to be
   1227 ///    permuted. \n
   1228 ///    Bits [1:0]: \n
   1229 ///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
   1230 ///          destination. \n
   1231 ///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
   1232 ///          destination. \n
   1233 ///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
   1234 ///          destination. \n
   1235 ///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
   1236 ///          destination. \n
   1237 ///    Bits [5:4]: \n
   1238 ///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
   1239 ///          destination. \n
   1240 ///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
   1241 ///          destination. \n
   1242 ///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
   1243 ///          destination. \n
   1244 ///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
   1245 ///          destination.
   1246 /// \returns A 256-bit vector of [4 x double] containing the copied values.
   1247 #define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
   1248   (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
   1249                                            (__v4df)(__m256d)(V2), (M)); })
   1250 
   1251 /// \brief Permutes 128-bit data values stored in two 256-bit vectors of
   1252 ///    [8 x float], as specified by the immediate integer operand.
   1253 ///
   1254 /// \headerfile <x86intrin.h>
   1255 ///
   1256 /// \code
   1257 /// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
   1258 /// \endcode
   1259 ///
   1260 /// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
   1261 ///
   1262 /// \param V1
   1263 ///    A 256-bit vector of [8 x float].
   1264 /// \param V2
   1265 ///    A 256-bit vector of [8 x float].
   1266 /// \param M
   1267 ///    An immediate integer operand specifying how the values are to be
   1268 ///    permuted. \n
   1269 ///    Bits [1:0]: \n
   1270 ///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
   1271 ///          destination. \n
   1272 ///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
   1273 ///          destination. \n
   1274 ///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
   1275 ///          destination. \n
   1276 ///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
   1277 ///          destination. \n
   1278 ///    Bits [5:4]: \n
   1279 ///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
   1280 ///          destination. \n
   1281 ///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
   1282 ///          destination. \n
   1283 ///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
   1284 ///          destination. \n
   1285 ///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
   1286 ///          destination.
   1287 /// \returns A 256-bit vector of [8 x float] containing the copied values.
   1288 #define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
   1289   (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
   1290                                           (__v8sf)(__m256)(V2), (M)); })
   1291 
   1292 /// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
   1293 ///    as specified by the immediate integer operand.
   1294 ///
   1295 /// \headerfile <x86intrin.h>
   1296 ///
   1297 /// \code
   1298 /// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
   1299 /// \endcode
   1300 ///
   1301 /// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
   1302 ///
   1303 /// \param V1
   1304 ///    A 256-bit integer vector.
   1305 /// \param V2
   1306 ///    A 256-bit integer vector.
   1307 /// \param M
   1308 ///    An immediate integer operand specifying how values are to be copied. \n
   1309 ///    Bits [1:0]: \n
   1310 ///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
   1311 ///          destination. \n
   1312 ///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
   1313 ///          destination. \n
   1314 ///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
   1315 ///          destination. \n
   1316 ///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
   1317 ///          destination. \n
   1318 ///    Bits [5:4]: \n
   1319 ///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
   1320 ///          destination. \n
   1321 ///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
   1322 ///          destination. \n
   1323 ///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
   1324 ///          destination. \n
   1325 ///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
   1326 ///          destination.
   1327 /// \returns A 256-bit integer vector containing the copied values.
   1328 #define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
   1329   (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
   1330                                            (__v8si)(__m256i)(V2), (M)); })
   1331 
   1332 /* Vector Blend */
   1333 /// \brief Merges 64-bit double-precision data values stored in either of the
   1334 ///    two 256-bit vectors of [4 x double], as specified by the immediate
   1335 ///    integer operand.
   1336 ///
   1337 /// \headerfile <x86intrin.h>
   1338 ///
   1339 /// \code
   1340 /// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
   1341 /// \endcode
   1342 ///
   1343 /// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
   1344 ///
   1345 /// \param V1
   1346 ///    A 256-bit vector of [4 x double].
   1347 /// \param V2
   1348 ///    A 256-bit vector of [4 x double].
   1349 /// \param M
   1350 ///    An immediate integer operand; only mask bits [3:0] are examined. The
   1351 ///    position of a mask bit corresponds to the index of a copied value. When
   1352 ///    a mask bit is 0, the corresponding 64-bit element in operand \a V1 is
   1353 ///    copied to the same position in the destination. When a mask bit is 1,
   1354 ///    the corresponding 64-bit element in operand \a V2 is copied to the same
   1355 ///    position in the destination.
   1356 /// \returns A 256-bit vector of [4 x double] containing the copied values.
   1357 #define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
   1358   (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
   1359                                    (__v4df)(__m256d)(V2), \
   1360                                    (((M) & 0x01) ? 4 : 0), \
   1361                                    (((M) & 0x02) ? 5 : 1), \
   1362                                    (((M) & 0x04) ? 6 : 2), \
   1363                                    (((M) & 0x08) ? 7 : 3)); })
   1364 
   1365 /// \brief Merges 32-bit single-precision data values stored in either of the
   1366 ///    two 256-bit vectors of [8 x float], as specified by the immediate
   1367 ///    integer operand.
   1368 ///
   1369 /// \headerfile <x86intrin.h>
   1370 ///
   1371 /// \code
   1372 /// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
   1373 /// \endcode
   1374 ///
   1375 /// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
   1376 ///
   1377 /// \param V1
   1378 ///    A 256-bit vector of [8 x float].
   1379 /// \param V2
   1380 ///    A 256-bit vector of [8 x float].
   1381 /// \param M
   1382 ///    An immediate integer operand; only mask bits [7:0] are examined. Bit i
   1383 ///    of the mask controls element i of the result. When a mask bit is 0, the
   1384 ///    corresponding 32-bit element in operand \a V1 is copied to the same
   1385 ///    position in the destination. When a mask bit is 1, the corresponding
   1386 ///    32-bit element in operand \a V2 is copied to the same position in the
   1387 ///    destination.
   1388 /// \returns A 256-bit vector of [8 x float] containing the copied values.
   1389 #define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
   1390   (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
   1391                                   (__v8sf)(__m256)(V2), \
   1392                                   (((M) & 0x01) ?  8 : 0), \
   1393                                   (((M) & 0x02) ?  9 : 1), \
   1394                                   (((M) & 0x04) ? 10 : 2), \
   1395                                   (((M) & 0x08) ? 11 : 3), \
   1396                                   (((M) & 0x10) ? 12 : 4), \
   1397                                   (((M) & 0x20) ? 13 : 5), \
   1398                                   (((M) & 0x40) ? 14 : 6), \
   1399                                   (((M) & 0x80) ? 15 : 7)); })
   1400 
   1401 /// \brief Merges 64-bit double-precision data values stored in either of the
   1402 ///    two 256-bit vectors of [4 x double], as specified by the 256-bit mask
   1403 ///    vector operand.
   1404 ///
   1405 /// \headerfile <x86intrin.h>
   1406 ///
   1407 /// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
   1408 ///
   1409 /// \param __a
   1410 ///    A 256-bit vector of [4 x double].
   1411 /// \param __b
   1412 ///    A 256-bit vector of [4 x double].
   1413 /// \param __c
   1414 ///    A 256-bit vector of [4 x double] used as the mask. Bits 255, 191, 127,
   1415 ///    and 63 (the most significant bit of each 64-bit element) specify how
   1416 ///    the values are to be copied. When a mask bit is 0, the corresponding
   1417 ///    64-bit element in operand \a __a is copied to the same position in
   1418 ///    the destination. When a mask bit is 1, the corresponding 64-bit
   1419 ///    element in operand \a __b is copied to the same position in the
   1420 ///    destination.
   1421 /// \returns A 256-bit vector of [4 x double] containing the copied values.
   1422 static __inline __m256d __DEFAULT_FN_ATTRS
   1423 _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
   1424 {
   1425   return (__m256d)__builtin_ia32_blendvpd256(
   1426     (__v4df)__a, (__v4df)__b, (__v4df)__c);
   1427 }
   1428 
   1429 /// \brief Merges 32-bit single-precision data values stored in either of the
   1430 ///    two 256-bit vectors of [8 x float], as specified by the 256-bit mask
   1431 ///    vector operand.
   1432 ///
   1433 /// \headerfile <x86intrin.h>
   1434 ///
   1435 /// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
   1436 ///
   1437 /// \param __a
   1438 ///    A 256-bit vector of [8 x float].
   1439 /// \param __b
   1440 ///    A 256-bit vector of [8 x float].
   1441 /// \param __c
   1442 ///    A 256-bit vector of [8 x float] used as the mask. Bits 255, 223, 191,
   1443 ///    159, 127, 95, 63, and 31 (the most significant bit of each 32-bit
   1444 ///    element) specify how the values are to be copied. When a mask bit is
   1445 ///    0, the corresponding 32-bit element in operand \a __a is copied to the
   1446 ///    same position in the destination. When a mask bit is 1, the
   1447 ///    corresponding 32-bit element in operand \a __b is copied to the same
   1448 ///    position in the destination.
   1449 /// \returns A 256-bit vector of [8 x float] containing the copied values.
   1450 static __inline __m256 __DEFAULT_FN_ATTRS
   1451 _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
   1452 {
   1453   return (__m256)__builtin_ia32_blendvps256(
   1454     (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
   1455 }
   1456 
   1457 /* Vector Dot Product */
   1458 /// \brief Computes two dot products in parallel, using the lower and upper
   1459 ///    halves of two [8 x float] vectors as input to the two computations, and
   1460 ///    returning the two dot products in the lower and upper halves of the
   1461 ///    [8 x float] result. The immediate integer operand controls which input
   1462 ///    elements will contribute to the dot product, and where the final results
   1463 ///    are returned. In general, for each dot product, the four corresponding
   1464 ///    elements of the input vectors are multiplied; the first two and second
   1465 ///    two products are summed, then the two sums are added to form the final
   1466 ///    result.
   1467 ///
   1468 /// \headerfile <x86intrin.h>
   1469 ///
   1470 /// \code
   1471 /// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
   1472 /// \endcode
   1473 ///
   1474 /// This intrinsic corresponds to the <c> VDPPS </c> instruction.
   1475 ///
   1476 /// \param V1
   1477 ///    A vector of [8 x float] values, treated as two [4 x float] vectors.
   1478 /// \param V2
   1479 ///    A vector of [8 x float] values, treated as two [4 x float] vectors.
   1480 /// \param M
   1481 ///    An immediate integer argument. Bits [7:4] determine which elements of
   1482 ///    the input vectors are used, with bit [4] corresponding to the lowest
   1483 ///    element and bit [7] corresponding to the highest element of each
   1484 ///    [4 x float] subvector. If a bit is set, the corresponding elements from
   1485 ///    the two input vectors are used as an input for the dot product; otherwise
   1486 ///    that input is treated as zero. Bits [3:0] determine which elements of
   1487 ///    the result will receive a copy of the final dot product, with bit [0]
   1488 ///    corresponding to the lowest element and bit [3] corresponding to the
   1489 ///    highest element of each [4 x float] subvector. If a bit is set, the dot
   1490 ///    product is returned in the corresponding element; otherwise that element
   1491 ///    is set to zero. The bitmask is applied in the same way to each of the
   1492 ///    two parallel dot product computations.
   1493 /// \returns A 256-bit vector of [8 x float] containing the two dot products.
   1494 #define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
   1495   (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
   1496                                  (__v8sf)(__m256)(V2), (M)); })
   1497 
   1498 /* Vector Shuffle */
   1499 /// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
   1500 ///    specified by the immediate value operand. The four selected elements in
   1501 ///    each operand are copied to the destination according to the bits
   1502 ///    specified in the immediate operand. The selected elements from the first
   1503 ///    256-bit operand are copied to bits [63:0] and bits [191:128] of the
   1504 ///    destination, and the selected elements from the second 256-bit operand
   1505 ///    are copied to bits [127:64] and bits [255:192] of the destination. For
   1506 ///    example, if bits [7:0] of the immediate operand contain a value of 0xFF,
   1507 ///    the 256-bit destination vector would contain the following values: b[7],
   1508 ///    b[7], a[7], a[7], b[3], b[3], a[3], a[3] (highest element first).
   1509 ///
   1510 /// \headerfile <x86intrin.h>
   1511 ///
   1512 /// \code
   1513 /// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
   1514 /// \endcode
   1515 ///
   1516 /// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
   1517 ///
   1518 /// \param a
   1519 ///    A 256-bit vector of [8 x float]. The four selected elements in this
   1520 ///    operand are copied to bits [63:0] and bits [191:128] in the destination,
   1521 ///    according to the bits specified in the immediate operand.
   1522 /// \param b
   1523 ///    A 256-bit vector of [8 x float]. The four selected elements in this
   1524 ///    operand are copied to bits [127:64] and bits [255:192] in the
   1525 ///    destination, according to the bits specified in the immediate operand.
   1526 /// \param mask
   1527 ///    An immediate value containing an 8-bit value specifying which elements to
   1528 ///    copy from \a a and \a b. \n
   1529 ///    Bits [3:0] specify the values copied from operand \a a. \n
   1530 ///    Bits [7:4] specify the values copied from operand \a b. \n
   1531 ///    The destinations within the 256-bit destination are assigned values as
   1532 ///    follows, according to the bit value assignments described below: \n
   1533 ///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
   1534 ///    destination. \n
   1535 ///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
   1536 ///    destination. \n
   1537 ///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
   1538 ///    destination. \n
   1539 ///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
   1540 ///    the destination. \n
   1541 ///    Bit value assignments: \n
   1542 ///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
   1543 ///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
   1544 ///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
   1545 ///    11: Bits [127:96] and [255:224] are copied from the selected operand.
   1546 /// \returns A 256-bit vector of [8 x float] containing the shuffled values.
   1547 #define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
   1548   (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
   1549                                   (__v8sf)(__m256)(b), \
   1550                                   0  + (((mask) >> 0) & 0x3), \
   1551                                   0  + (((mask) >> 2) & 0x3), \
   1552                                   8  + (((mask) >> 4) & 0x3), \
   1553                                   8  + (((mask) >> 6) & 0x3), \
   1554                                   4  + (((mask) >> 0) & 0x3), \
   1555                                   4  + (((mask) >> 2) & 0x3), \
   1556                                   12 + (((mask) >> 4) & 0x3), \
   1557                                   12 + (((mask) >> 6) & 0x3)); })
   1558 
   1559 /// \brief Selects four double-precision values from the 256-bit operands of
   1560 ///    [4 x double], as specified by the immediate value operand. The selected
   1561 ///    elements from the first 256-bit operand are copied to bits [63:0] and
   1562 ///    bits [191:128] in the destination, and the selected elements from the
   1563 ///    second 256-bit operand are copied to bits [127:64] and bits [255:192] in
   1564 ///    the destination. For example, if bits [3:0] of the immediate operand
   1565 ///    contain a value of 0xF, the 256-bit destination vector would contain the
   1566 ///    following values: b[3], a[3], b[1], a[1] (highest element first).
   1567 ///
   1568 /// \headerfile <x86intrin.h>
   1569 ///
   1570 /// \code
   1571 /// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
   1572 /// \endcode
   1573 ///
   1574 /// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
   1575 ///
   1576 /// \param a
   1577 ///    A 256-bit vector of [4 x double].
   1578 /// \param b
   1579 ///    A 256-bit vector of [4 x double].
   1580 /// \param mask
   1581 ///    An immediate value whose bits [3:0] specify which elements to copy from
   1582 ///    \a a and \a b: \n
   1583 ///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
   1584 ///    destination. \n
   1585 ///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
   1586 ///    destination. \n
   1587 ///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
   1588 ///    destination. \n
   1589 ///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
   1590 ///    destination. \n
   1591 ///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
   1592 ///    destination. \n
   1593 ///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
   1594 ///    destination. \n
   1595 ///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
   1596 ///    destination. \n
   1597 ///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
   1598 ///    destination.
   1599 /// \returns A 256-bit vector of [4 x double] containing the shuffled values.
   1600 #define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
   1601   (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
   1602                                    (__v4df)(__m256d)(b), \
   1603                                    0 + (((mask) >> 0) & 0x1), \
   1604                                    4 + (((mask) >> 1) & 0x1), \
   1605                                    2 + (((mask) >> 2) & 0x1), \
   1606                                    6 + (((mask) >> 3) & 0x1)); })
   1607 
   1608 /* Compare predicates: values for bits [4:0] of the _mm*_cmp_* immediate */
   1609 #define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling) */
   1610 #define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling) */
   1611 #define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling) */
   1612 #define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling) */
   1613 #define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling) */
   1614 #define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling) */
   1615 #define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling) */
   1616 #define _CMP_ORD_Q    0x07 /* Ordered (non-signaling) */
   1617 #define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
   1618 #define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling) */
   1619 #define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
   1620 #define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
   1621 #define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
   1622 #define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
   1623 #define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
   1624 #define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
   1625 #define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
   1626 #define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
   1627 #define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
   1628 #define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
   1629 #define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
   1630 #define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
   1631 #define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
   1632 #define _CMP_ORD_S    0x17 /* Ordered (signaling) */
   1633 #define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
   1634 #define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
   1635 #define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
   1636 #define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
   1637 #define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
   1638 #define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
   1639 #define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
   1640 #define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */
   1641 
   1642 /// \brief Compares each of the corresponding double-precision values of two
   1643 ///    128-bit vectors of [2 x double], using the operation specified by the
   1644 ///    immediate integer operand. Returns a [2 x double] vector consisting of
   1645 ///    two doubles corresponding to the two comparison results: zero if the
   1646 ///    comparison is false, and all 1's if the comparison is true.
   1647 ///
   1648 /// \headerfile <x86intrin.h>
   1649 ///
   1650 /// \code
   1651 /// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
   1652 /// \endcode
   1653 ///
   1654 /// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
   1655 ///
   1656 /// \param a
   1657 ///    A 128-bit vector of [2 x double].
   1658 /// \param b
   1659 ///    A 128-bit vector of [2 x double].
   1660 /// \param c
   1661 ///    An immediate integer operand, with bits [4:0] specifying which comparison
   1662 ///    operation to use: \n
   1663 ///    0x00 : Equal (ordered, non-signaling) \n
   1664 ///    0x01 : Less-than (ordered, signaling) \n
   1665 ///    0x02 : Less-than-or-equal (ordered, signaling) \n
   1666 ///    0x03 : Unordered (non-signaling) \n
   1667 ///    0x04 : Not-equal (unordered, non-signaling) \n
   1668 ///    0x05 : Not-less-than (unordered, signaling) \n
   1669 ///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
   1670 ///    0x07 : Ordered (non-signaling) \n
   1671 ///    0x08 : Equal (unordered, non-signaling) \n
   1672 ///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
   1673 ///    0x0a : Not-greater-than (unordered, signaling) \n
   1674 ///    0x0b : False (ordered, non-signaling) \n
   1675 ///    0x0c : Not-equal (ordered, non-signaling) \n
   1676 ///    0x0d : Greater-than-or-equal (ordered, signaling) \n
   1677 ///    0x0e : Greater-than (ordered, signaling) \n
   1678 ///    0x0f : True (unordered, non-signaling) \n
   1679 ///    0x10 : Equal (ordered, signaling) \n
   1680 ///    0x11 : Less-than (ordered, non-signaling) \n
   1681 ///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
   1682 ///    0x13 : Unordered (signaling) \n
   1683 ///    0x14 : Not-equal (unordered, signaling) \n
   1684 ///    0x15 : Not-less-than (unordered, non-signaling) \n
   1685 ///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
   1686 ///    0x17 : Ordered (signaling) \n
   1687 ///    0x18 : Equal (unordered, signaling) \n
   1688 ///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
   1689 ///    0x1a : Not-greater-than (unordered, non-signaling) \n
   1690 ///    0x1b : False (ordered, signaling) \n
   1691 ///    0x1c : Not-equal (ordered, signaling) \n
   1692 ///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
   1693 ///    0x1e : Greater-than (ordered, non-signaling) \n
   1694 ///    0x1f : True (unordered, signaling)
   1695 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
   1696 #define _mm_cmp_pd(a, b, c) __extension__ ({ \
   1697   (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
   1698                                 (__v2df)(__m128d)(b), (c)); })
   1699 
   1700 /// \brief Compares each of the corresponding values of two 128-bit vectors of
   1701 ///    [4 x float], using the operation specified by the immediate integer
   1702 ///    operand. Returns a [4 x float] vector consisting of four floats
   1703 ///    corresponding to the four comparison results: zero if the comparison is
   1704 ///    false, and all 1's if the comparison is true.
   1705 ///
   1706 /// \headerfile <x86intrin.h>
   1707 ///
   1708 /// \code
   1709 /// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
   1710 /// \endcode
   1711 ///
   1712 /// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
   1713 ///
   1714 /// \param a
   1715 ///    A 128-bit vector of [4 x float].
   1716 /// \param b
   1717 ///    A 128-bit vector of [4 x float].
   1718 /// \param c
   1719 ///    An immediate integer operand, with bits [4:0] specifying which comparison
   1720 ///    operation to use: \n
   1721 ///    0x00 : Equal (ordered, non-signaling) \n
   1722 ///    0x01 : Less-than (ordered, signaling) \n
   1723 ///    0x02 : Less-than-or-equal (ordered, signaling) \n
   1724 ///    0x03 : Unordered (non-signaling) \n
   1725 ///    0x04 : Not-equal (unordered, non-signaling) \n
   1726 ///    0x05 : Not-less-than (unordered, signaling) \n
   1727 ///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
   1728 ///    0x07 : Ordered (non-signaling) \n
   1729 ///    0x08 : Equal (unordered, non-signaling) \n
   1730 ///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
   1731 ///    0x0a : Not-greater-than (unordered, signaling) \n
   1732 ///    0x0b : False (ordered, non-signaling) \n
   1733 ///    0x0c : Not-equal (ordered, non-signaling) \n
   1734 ///    0x0d : Greater-than-or-equal (ordered, signaling) \n
   1735 ///    0x0e : Greater-than (ordered, signaling) \n
   1736 ///    0x0f : True (unordered, non-signaling) \n
   1737 ///    0x10 : Equal (ordered, signaling) \n
   1738 ///    0x11 : Less-than (ordered, non-signaling) \n
   1739 ///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
   1740 ///    0x13 : Unordered (signaling) \n
   1741 ///    0x14 : Not-equal (unordered, signaling) \n
   1742 ///    0x15 : Not-less-than (unordered, non-signaling) \n
   1743 ///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
   1744 ///    0x17 : Ordered (signaling) \n
   1745 ///    0x18 : Equal (unordered, signaling) \n
   1746 ///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
   1747 ///    0x1a : Not-greater-than (unordered, non-signaling) \n
   1748 ///    0x1b : False (ordered, signaling) \n
   1749 ///    0x1c : Not-equal (ordered, signaling) \n
   1750 ///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
   1751 ///    0x1e : Greater-than (ordered, non-signaling) \n
   1752 ///    0x1f : True (unordered, signaling)
   1753 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
   1754 #define _mm_cmp_ps(a, b, c) __extension__ ({ \
   1755   (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
   1756                                (__v4sf)(__m128)(b), (c)); })
   1757 
   1758 /// \brief Compares each of the corresponding double-precision values of two
   1759 ///    256-bit vectors of [4 x double], using the operation specified by the
   1760 ///    immediate integer operand. Returns a [4 x double] vector consisting of
   1761 ///    four doubles corresponding to the four comparison results: zero if the
   1762 ///    comparison is false, and all 1's if the comparison is true.
   1763 ///
   1764 /// \headerfile <x86intrin.h>
   1765 ///
   1766 /// \code
   1767 /// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
   1768 /// \endcode
   1769 ///
   1770 /// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
   1771 ///
   1772 /// \param a
   1773 ///    A 256-bit vector of [4 x double].
   1774 /// \param b
   1775 ///    A 256-bit vector of [4 x double].
   1776 /// \param c
   1777 ///    An immediate integer operand, with bits [4:0] specifying which comparison
   1778 ///    operation to use: \n
   1779 ///    0x00 : Equal (ordered, non-signaling) \n
   1780 ///    0x01 : Less-than (ordered, signaling) \n
   1781 ///    0x02 : Less-than-or-equal (ordered, signaling) \n
   1782 ///    0x03 : Unordered (non-signaling) \n
   1783 ///    0x04 : Not-equal (unordered, non-signaling) \n
   1784 ///    0x05 : Not-less-than (unordered, signaling) \n
   1785 ///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
   1786 ///    0x07 : Ordered (non-signaling) \n
   1787 ///    0x08 : Equal (unordered, non-signaling) \n
   1788 ///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
   1789 ///    0x0a : Not-greater-than (unordered, signaling) \n
   1790 ///    0x0b : False (ordered, non-signaling) \n
   1791 ///    0x0c : Not-equal (ordered, non-signaling) \n
   1792 ///    0x0d : Greater-than-or-equal (ordered, signaling) \n
   1793 ///    0x0e : Greater-than (ordered, signaling) \n
   1794 ///    0x0f : True (unordered, non-signaling) \n
   1795 ///    0x10 : Equal (ordered, signaling) \n
   1796 ///    0x11 : Less-than (ordered, non-signaling) \n
   1797 ///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
   1798 ///    0x13 : Unordered (signaling) \n
   1799 ///    0x14 : Not-equal (unordered, signaling) \n
   1800 ///    0x15 : Not-less-than (unordered, non-signaling) \n
   1801 ///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
   1802 ///    0x17 : Ordered (signaling) \n
   1803 ///    0x18 : Equal (unordered, signaling) \n
   1804 ///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
   1805 ///    0x1a : Not-greater-than (unordered, non-signaling) \n
   1806 ///    0x1b : False (ordered, signaling) \n
   1807 ///    0x1c : Not-equal (ordered, signaling) \n
   1808 ///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
   1809 ///    0x1e : Greater-than (ordered, non-signaling) \n
   1810 ///    0x1f : True (unordered, signaling)
   1811 /// \returns A 256-bit vector of [4 x double] containing the comparison results.
   1812 #define _mm256_cmp_pd(a, b, c) __extension__ ({ \
   1813   (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
   1814                                    (__v4df)(__m256d)(b), (c)); })
   1815 
   1816 /// \brief Compares each of the corresponding values of two 256-bit vectors of
   1817 ///    [8 x float], using the operation specified by the immediate integer
   1818 ///    operand. Returns a [8 x float] vector consisting of eight floats
   1819 ///    corresponding to the eight comparison results: zero if the comparison is
   1820 ///    false, and all 1's if the comparison is true.
   1821 ///
   1822 /// \headerfile <x86intrin.h>
   1823 ///
   1824 /// \code
   1825 /// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
   1826 /// \endcode
   1827 ///
   1828 /// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
   1829 ///
   1830 /// \param a
   1831 ///    A 256-bit vector of [8 x float].
   1832 /// \param b
   1833 ///    A 256-bit vector of [8 x float].
   1834 /// \param c
   1835 ///    An immediate integer operand, with bits [4:0] specifying which comparison
   1836 ///    operation to use: \n
   1837 ///    0x00 : Equal (ordered, non-signaling) \n
   1838 ///    0x01 : Less-than (ordered, signaling) \n
   1839 ///    0x02 : Less-than-or-equal (ordered, signaling) \n
   1840 ///    0x03 : Unordered (non-signaling) \n
   1841 ///    0x04 : Not-equal (unordered, non-signaling) \n
   1842 ///    0x05 : Not-less-than (unordered, signaling) \n
   1843 ///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
   1844 ///    0x07 : Ordered (non-signaling) \n
   1845 ///    0x08 : Equal (unordered, non-signaling) \n
   1846 ///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
   1847 ///    0x0a : Not-greater-than (unordered, signaling) \n
   1848 ///    0x0b : False (ordered, non-signaling) \n
   1849 ///    0x0c : Not-equal (ordered, non-signaling) \n
   1850 ///    0x0d : Greater-than-or-equal (ordered, signaling) \n
   1851 ///    0x0e : Greater-than (ordered, signaling) \n
   1852 ///    0x0f : True (unordered, non-signaling) \n
   1853 ///    0x10 : Equal (ordered, signaling) \n
   1854 ///    0x11 : Less-than (ordered, non-signaling) \n
   1855 ///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
   1856 ///    0x13 : Unordered (signaling) \n
   1857 ///    0x14 : Not-equal (unordered, signaling) \n
   1858 ///    0x15 : Not-less-than (unordered, non-signaling) \n
   1859 ///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
   1860 ///    0x17 : Ordered (signaling) \n
   1861 ///    0x18 : Equal (unordered, signaling) \n
   1862 ///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
   1863 ///    0x1a : Not-greater-than (unordered, non-signaling) \n
   1864 ///    0x1b : False (ordered, signaling) \n
   1865 ///    0x1c : Not-equal (ordered, signaling) \n
   1866 ///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
   1867 ///    0x1e : Greater-than (ordered, non-signaling) \n
   1868 ///    0x1f : True (unordered, signaling)
   1869 /// \returns A 256-bit vector of [8 x float] containing the comparison results.
   1870 #define _mm256_cmp_ps(a, b, c) __extension__ ({ \
   1871   (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
   1872                                   (__v8sf)(__m256)(b), (c)); })
   1873 
   1874 /// \brief Compares each of the corresponding scalar double-precision values of
   1875 ///    two 128-bit vectors of [2 x double], using the operation specified by the
   1876 ///    immediate integer operand. If the result is true, all 64 bits of the
   1877 ///    destination vector are set; otherwise they are cleared.
   1878 ///
   1879 /// \headerfile <x86intrin.h>
   1880 ///
   1881 /// \code
   1882 /// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
   1883 /// \endcode
   1884 ///
   1885 /// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
   1886 ///
   1887 /// \param a
   1888 ///    A 128-bit vector of [2 x double].
   1889 /// \param b
   1890 ///    A 128-bit vector of [2 x double].
   1891 /// \param c
   1892 ///    An immediate integer operand, with bits [4:0] specifying which comparison
   1893 ///    operation to use: \n
   1894 ///    0x00 : Equal (ordered, non-signaling) \n
   1895 ///    0x01 : Less-than (ordered, signaling) \n
   1896 ///    0x02 : Less-than-or-equal (ordered, signaling) \n
   1897 ///    0x03 : Unordered (non-signaling) \n
   1898 ///    0x04 : Not-equal (unordered, non-signaling) \n
   1899 ///    0x05 : Not-less-than (unordered, signaling) \n
   1900 ///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
   1901 ///    0x07 : Ordered (non-signaling) \n
   1902 ///    0x08 : Equal (unordered, non-signaling) \n
   1903 ///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
   1904 ///    0x0a : Not-greater-than (unordered, signaling) \n
   1905 ///    0x0b : False (ordered, non-signaling) \n
   1906 ///    0x0c : Not-equal (ordered, non-signaling) \n
   1907 ///    0x0d : Greater-than-or-equal (ordered, signaling) \n
   1908 ///    0x0e : Greater-than (ordered, signaling) \n
   1909 ///    0x0f : True (unordered, non-signaling) \n
   1910 ///    0x10 : Equal (ordered, signaling) \n
   1911 ///    0x11 : Less-than (ordered, non-signaling) \n
   1912 ///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
   1913 ///    0x13 : Unordered (signaling) \n
   1914 ///    0x14 : Not-equal (unordered, signaling) \n
   1915 ///    0x15 : Not-less-than (unordered, non-signaling) \n
   1916 ///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
   1917 ///    0x17 : Ordered (signaling) \n
   1918 ///    0x18 : Equal (unordered, signaling) \n
   1919 ///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
   1920 ///    0x1a : Not-greater-than (unordered, non-signaling) \n
   1921 ///    0x1b : False (ordered, signaling) \n
   1922 ///    0x1c : Not-equal (ordered, signaling) \n
   1923 ///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
   1924 ///    0x1e : Greater-than (ordered, non-signaling) \n
   1925 ///    0x1f : True (unordered, signaling)
   1926 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
   1927 #define _mm_cmp_sd(a, b, c) __extension__ ({ \
   1928   (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
   1929                                 (__v2df)(__m128d)(b), (c)); })
   1930 
   1931 /// \brief Compares each of the corresponding scalar values of two 128-bit
   1932 ///    vectors of [4 x float], using the operation specified by the immediate
   1933 ///    integer operand. If the result is true, all 32 bits of the destination
   1934 ///    vector are set; otherwise they are cleared.
   1935 ///
   1936 /// \headerfile <x86intrin.h>
   1937 ///
   1938 /// \code
   1939 /// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
   1940 /// \endcode
   1941 ///
   1942 /// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
   1943 ///
   1944 /// \param a
   1945 ///    A 128-bit vector of [4 x float].
   1946 /// \param b
   1947 ///    A 128-bit vector of [4 x float].
   1948 /// \param c
   1949 ///    An immediate integer operand, with bits [4:0] specifying which comparison
   1950 ///    operation to use: \n
   1951 ///    0x00 : Equal (ordered, non-signaling)
   1952 ///    0x01 : Less-than (ordered, signaling)
   1953 ///    0x02 : Less-than-or-equal (ordered, signaling)
   1954 ///    0x03 : Unordered (non-signaling)
   1955 ///    0x04 : Not-equal (unordered, non-signaling)
   1956 ///    0x05 : Not-less-than (unordered, signaling)
   1957 ///    0x06 : Not-less-than-or-equal (unordered, signaling)
   1958 ///    0x07 : Ordered (non-signaling)
   1959 ///    0x08 : Equal (unordered, non-signaling)
   1960 ///    0x09 : Not-greater-than-or-equal (unordered, signaling)
   1961 ///    0x0a : Not-greater-than (unordered, signaling)
   1962 ///    0x0b : False (ordered, non-signaling)
   1963 ///    0x0c : Not-equal (ordered, non-signaling)
   1964 ///    0x0d : Greater-than-or-equal (ordered, signaling)
   1965 ///    0x0e : Greater-than (ordered, signaling)
   1966 ///    0x0f : True (unordered, non-signaling)
   1967 ///    0x10 : Equal (ordered, signaling)
   1968 ///    0x11 : Less-than (ordered, non-signaling)
   1969 ///    0x12 : Less-than-or-equal (ordered, non-signaling)
   1970 ///    0x13 : Unordered (signaling)
   1971 ///    0x14 : Not-equal (unordered, signaling)
   1972 ///    0x15 : Not-less-than (unordered, non-signaling)
   1973 ///    0x16 : Not-less-than-or-equal (unordered, non-signaling)
   1974 ///    0x17 : Ordered (signaling)
   1975 ///    0x18 : Equal (unordered, signaling)
   1976 ///    0x19 : Not-greater-than-or-equal (unordered, non-signaling)
   1977 ///    0x1a : Not-greater-than (unordered, non-signaling)
   1978 ///    0x1b : False (ordered, signaling)
   1979 ///    0x1c : Not-equal (ordered, signaling)
   1980 ///    0x1d : Greater-than-or-equal (ordered, non-signaling)
   1981 ///    0x1e : Greater-than (ordered, non-signaling)
   1982 ///    0x1f : True (unordered, signaling)
   1983 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
   1984 #define _mm_cmp_ss(a, b, c) __extension__ ({ \
   1985   (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
   1986                                (__v4sf)(__m128)(b), (c)); })
   1987 
   1988 /// \brief Takes a [8 x i32] vector and returns the vector element value
   1989 ///    indexed by the immediate constant operand.
   1990 ///
   1991 /// \headerfile <x86intrin.h>
   1992 ///
   1993 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
   1994 ///   instruction.
   1995 ///
   1996 /// \param __a
   1997 ///    A 256-bit vector of [8 x i32].
   1998 /// \param __imm
   1999 ///    An immediate integer operand with bits [2:0] determining which vector
   2000 ///    element is extracted and returned.
   2001 /// \returns A 32-bit integer containing the extracted 32 bits of
   2002 ///    packed data.
   2003 static __inline int __DEFAULT_FN_ATTRS
   2004 _mm256_extract_epi32(__m256i __a, const int __imm)
   2005 {
   2006   __v8si __b = (__v8si)__a;
   2007   return __b[__imm & 7];
   2008 }
   2009 
   2010 /// \brief Takes a [16 x i16] vector and returns the vector element value
   2011 ///    indexed by the immediate constant operand.
   2012 ///
   2013 /// \headerfile <x86intrin.h>
   2014 ///
   2015 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
   2016 ///   instruction.
   2017 ///
   2018 /// \param __a
   2019 ///    A 256-bit integer vector of [16 x i16].
   2020 /// \param __imm
   2021 ///    An immediate integer operand with bits [3:0] determining which vector
   2022 ///    element is extracted and returned.
   2023 /// \returns A 32-bit integer containing the extracted 16 bits of zero-extended
   2024 ///    packed data.
   2025 static __inline int __DEFAULT_FN_ATTRS
   2026 _mm256_extract_epi16(__m256i __a, const int __imm)
   2027 {
   2028   __v16hi __b = (__v16hi)__a;
   2029   return (unsigned short)__b[__imm & 15];
   2030 }
   2031 
   2032 /// \brief Takes a [32 x i8] vector and returns the vector element value
   2033 ///    indexed by the immediate constant operand.
   2034 ///
   2035 /// \headerfile <x86intrin.h>
   2036 ///
   2037 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
   2038 ///   instruction.
   2039 ///
   2040 /// \param __a
   2041 ///    A 256-bit integer vector of [32 x i8].
   2042 /// \param __imm
   2043 ///    An immediate integer operand with bits [4:0] determining which vector
   2044 ///    element is extracted and returned.
   2045 /// \returns A 32-bit integer containing the extracted 8 bits of zero-extended
   2046 ///    packed data.
   2047 static __inline int __DEFAULT_FN_ATTRS
   2048 _mm256_extract_epi8(__m256i __a, const int __imm)
   2049 {
   2050   __v32qi __b = (__v32qi)__a;
   2051   return (unsigned char)__b[__imm & 31];
   2052 }
   2053 
   2054 #ifdef __x86_64__
   2055 /// \brief Takes a [4 x i64] vector and returns the vector element value
   2056 ///    indexed by the immediate constant operand.
   2057 ///
   2058 /// \headerfile <x86intrin.h>
   2059 ///
   2060 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
   2061 ///   instruction.
   2062 ///
   2063 /// \param __a
   2064 ///    A 256-bit integer vector of [4 x i64].
   2065 /// \param __imm
   2066 ///    An immediate integer operand with bits [1:0] determining which vector
   2067 ///    element is extracted and returned.
   2068 /// \returns A 64-bit integer containing the extracted 64 bits of
   2069 ///    packed data.
   2070 static __inline long long  __DEFAULT_FN_ATTRS
   2071 _mm256_extract_epi64(__m256i __a, const int __imm)
   2072 {
   2073   __v4di __b = (__v4di)__a;
   2074   return __b[__imm & 3];
   2075 }
   2076 #endif
   2077 
   2078 /// \brief Takes a [8 x i32] vector and replaces the vector element value
   2079 ///    indexed by the immediate constant operand by a new value. Returns the
   2080 ///    modified vector.
   2081 ///
   2082 /// \headerfile <x86intrin.h>
   2083 ///
   2084 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
   2085 ///   instruction.
   2086 ///
   2087 /// \param __a
   2088 ///    A vector of [8 x i32] to be used by the insert operation.
   2089 /// \param __b
   2090 ///    An integer value. The replacement value for the insert operation.
   2091 /// \param __imm
   2092 ///    An immediate integer specifying the index of the vector element to be
   2093 ///    replaced.
   2094 /// \returns A copy of vector \a __a, after replacing its element indexed by
   2095 ///    \a __imm with \a __b.
   2096 static __inline __m256i __DEFAULT_FN_ATTRS
   2097 _mm256_insert_epi32(__m256i __a, int __b, int const __imm)
   2098 {
   2099   __v8si __c = (__v8si)__a;
   2100   __c[__imm & 7] = __b;
   2101   return (__m256i)__c;
   2102 }
   2103 
   2104 
   2105 /// \brief Takes a [16 x i16] vector and replaces the vector element value
   2106 ///    indexed by the immediate constant operand with a new value. Returns the
   2107 ///    modified vector.
   2108 ///
   2109 /// \headerfile <x86intrin.h>
   2110 ///
   2111 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
   2112 ///   instruction.
   2113 ///
   2114 /// \param __a
   2115 ///    A vector of [16 x i16] to be used by the insert operation.
   2116 /// \param __b
   2117 ///    An i16 integer value. The replacement value for the insert operation.
   2118 /// \param __imm
   2119 ///    An immediate integer specifying the index of the vector element to be
   2120 ///    replaced.
   2121 /// \returns A copy of vector \a __a, after replacing its element indexed by
   2122 ///    \a __imm with \a __b.
   2123 static __inline __m256i __DEFAULT_FN_ATTRS
   2124 _mm256_insert_epi16(__m256i __a, int __b, int const __imm)
   2125 {
   2126   __v16hi __c = (__v16hi)__a;
   2127   __c[__imm & 15] = __b;
   2128   return (__m256i)__c;
   2129 }
   2130 
   2131 /// \brief Takes a [32 x i8] vector and replaces the vector element value
   2132 ///    indexed by the immediate constant operand with a new value. Returns the
   2133 ///    modified vector.
   2134 ///
   2135 /// \headerfile <x86intrin.h>
   2136 ///
   2137 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
   2138 ///   instruction.
   2139 ///
   2140 /// \param __a
   2141 ///    A vector of [32 x i8] to be used by the insert operation.
   2142 /// \param __b
   2143 ///    An i8 integer value. The replacement value for the insert operation.
   2144 /// \param __imm
   2145 ///    An immediate integer specifying the index of the vector element to be
   2146 ///    replaced.
   2147 /// \returns A copy of vector \a __a, after replacing its element indexed by
   2148 ///    \a __imm with \a __b.
   2149 static __inline __m256i __DEFAULT_FN_ATTRS
   2150 _mm256_insert_epi8(__m256i __a, int __b, int const __imm)
   2151 {
   2152   __v32qi __c = (__v32qi)__a;
   2153   __c[__imm & 31] = __b;
   2154   return (__m256i)__c;
   2155 }
   2156 
   2157 #ifdef __x86_64__
   2158 /// \brief Takes a [4 x i64] vector and replaces the vector element value
   2159 ///    indexed by the immediate constant operand with a new value. Returns the
   2160 ///    modified vector.
   2161 ///
   2162 /// \headerfile <x86intrin.h>
   2163 ///
   2164 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
   2165 ///   instruction.
   2166 ///
   2167 /// \param __a
   2168 ///    A vector of [4 x i64] to be used by the insert operation.
   2169 /// \param __b
   2170 ///    A 64-bit integer value. The replacement value for the insert operation.
   2171 /// \param __imm
   2172 ///    An immediate integer specifying the index of the vector element to be
   2173 ///    replaced.
   2174 /// \returns A copy of vector \a __a, after replacing its element indexed by
   2175 ///     \a __imm with \a __b.
   2176 static __inline __m256i __DEFAULT_FN_ATTRS
   2177 _mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
   2178 {
   2179   __v4di __c = (__v4di)__a;
   2180   __c[__imm & 3] = __b;
   2181   return (__m256i)__c;
   2182 }
   2183 #endif
   2184 
   2185 /* Conversion */
   2186 /// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
   2187 ///
   2188 /// \headerfile <x86intrin.h>
   2189 ///
   2190 /// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
   2191 ///
   2192 /// \param __a
   2193 ///    A 128-bit integer vector of [4 x i32].
   2194 /// \returns A 256-bit vector of [4 x double] containing the converted values.
   2195 static __inline __m256d __DEFAULT_FN_ATTRS
   2196 _mm256_cvtepi32_pd(__m128i __a)
   2197 {
   2198   return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
   2199 }
   2200 
   2201 /// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
   2202 ///
   2203 /// \headerfile <x86intrin.h>
   2204 ///
   2205 /// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
   2206 ///
   2207 /// \param __a
   2208 ///    A 256-bit integer vector.
   2209 /// \returns A 256-bit vector of [8 x float] containing the converted values.
   2210 static __inline __m256 __DEFAULT_FN_ATTRS
   2211 _mm256_cvtepi32_ps(__m256i __a)
   2212 {
   2213   return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
   2214 }
   2215 
   2216 /// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
   2217 ///    [4 x float].
   2218 ///
   2219 /// \headerfile <x86intrin.h>
   2220 ///
   2221 /// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
   2222 ///
   2223 /// \param __a
   2224 ///    A 256-bit vector of [4 x double].
   2225 /// \returns A 128-bit vector of [4 x float] containing the converted values.
   2226 static __inline __m128 __DEFAULT_FN_ATTRS
   2227 _mm256_cvtpd_ps(__m256d __a)
   2228 {
   2229   return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
   2230 }
   2231 
   2232 /// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
   2233 ///
   2234 /// \headerfile <x86intrin.h>
   2235 ///
   2236 /// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
   2237 ///
   2238 /// \param __a
   2239 ///    A 256-bit vector of [8 x float].
   2240 /// \returns A 256-bit integer vector containing the converted values.
   2241 static __inline __m256i __DEFAULT_FN_ATTRS
   2242 _mm256_cvtps_epi32(__m256 __a)
   2243 {
   2244   return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
   2245 }
   2246 
   2247 /// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
   2248 ///    x double].
   2249 ///
   2250 /// \headerfile <x86intrin.h>
   2251 ///
   2252 /// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
   2253 ///
   2254 /// \param __a
   2255 ///    A 128-bit vector of [4 x float].
   2256 /// \returns A 256-bit vector of [4 x double] containing the converted values.
   2257 static __inline __m256d __DEFAULT_FN_ATTRS
   2258 _mm256_cvtps_pd(__m128 __a)
   2259 {
   2260   return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
   2261 }
   2262 
   2263 /// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
   2264 ///    x i32], truncating the result by rounding towards zero when it is
   2265 ///    inexact.
   2266 ///
   2267 /// \headerfile <x86intrin.h>
   2268 ///
   2269 /// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
   2270 ///
   2271 /// \param __a
   2272 ///    A 256-bit vector of [4 x double].
   2273 /// \returns A 128-bit integer vector containing the converted values.
   2274 static __inline __m128i __DEFAULT_FN_ATTRS
   2275 _mm256_cvttpd_epi32(__m256d __a)
   2276 {
   2277   return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
   2278 }
   2279 
   2280 /// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
   2281 ///    x i32]. When a conversion is inexact, the value returned is rounded
   2282 ///    according to the rounding control bits in the MXCSR register.
   2283 ///
   2284 /// \headerfile <x86intrin.h>
   2285 ///
   2286 /// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
   2287 ///
   2288 /// \param __a
   2289 ///    A 256-bit vector of [4 x double].
   2290 /// \returns A 128-bit integer vector containing the converted values.
   2291 static __inline __m128i __DEFAULT_FN_ATTRS
   2292 _mm256_cvtpd_epi32(__m256d __a)
   2293 {
   2294   return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
   2295 }
   2296 
   2297 /// \brief Converts a vector of [8 x float] into a vector of [8 x i32],
   2298 ///    truncating the result by rounding towards zero when it is inexact.
   2299 ///
   2300 /// \headerfile <x86intrin.h>
   2301 ///
   2302 /// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
   2303 ///
   2304 /// \param __a
   2305 ///    A 256-bit vector of [8 x float].
   2306 /// \returns A 256-bit integer vector containing the converted values.
   2307 static __inline __m256i __DEFAULT_FN_ATTRS
   2308 _mm256_cvttps_epi32(__m256 __a)
   2309 {
   2310   return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
   2311 }
   2312 
   2313 /// \brief Returns the first element of the input vector of [4 x double].
   2314 ///
   2315 /// \headerfile <x86intrin.h>
   2316 ///
   2317 /// This intrinsic is a utility function and does not correspond to a specific
   2318 ///    instruction.
   2319 ///
   2320 /// \param __a
   2321 ///    A 256-bit vector of [4 x double].
   2322 /// \returns A 64-bit double containing the first element of the input vector.
   2323 static __inline double __DEFAULT_FN_ATTRS
   2324 _mm256_cvtsd_f64(__m256d __a)
   2325 {
   2326  return __a[0];
   2327 }
   2328 
   2329 /// \brief Returns the first element of the input vector of [8 x i32].
   2330 ///
   2331 /// \headerfile <x86intrin.h>
   2332 ///
   2333 /// This intrinsic is a utility function and does not correspond to a specific
   2334 ///    instruction.
   2335 ///
   2336 /// \param __a
   2337 ///    A 256-bit vector of [8 x i32].
   2338 /// \returns A 32-bit integer containing the first element of the input vector.
   2339 static __inline int __DEFAULT_FN_ATTRS
   2340 _mm256_cvtsi256_si32(__m256i __a)
   2341 {
   2342  __v8si __b = (__v8si)__a;
   2343  return __b[0];
   2344 }
   2345 
   2346 /// \brief Returns the first element of the input vector of [8 x float].
   2347 ///
   2348 /// \headerfile <x86intrin.h>
   2349 ///
   2350 /// This intrinsic is a utility function and does not correspond to a specific
   2351 ///    instruction.
   2352 ///
   2353 /// \param __a
   2354 ///    A 256-bit vector of [8 x float].
   2355 /// \returns A 32-bit float containing the first element of the input vector.
   2356 static __inline float __DEFAULT_FN_ATTRS
   2357 _mm256_cvtss_f32(__m256 __a)
   2358 {
   2359  return __a[0];
   2360 }
   2361 
   2362 /* Vector replicate */
   2363 /// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit
   2364 ///    vector of [8 x float] to float values in a 256-bit vector of
   2365 ///    [8 x float].
   2366 ///
   2367 /// \headerfile <x86intrin.h>
   2368 ///
   2369 /// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
   2370 ///
   2371 /// \param __a
   2372 ///    A 256-bit vector of [8 x float]. \n
   2373 ///    Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
   2374 ///    the return value. \n
   2375 ///    Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
   2376 ///    the return value. \n
   2377 ///    Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
   2378 ///    return value. \n
   2379 ///    Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
   2380 ///    return value.
   2381 /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
   2382 ///    values.
   2383 static __inline __m256 __DEFAULT_FN_ATTRS
   2384 _mm256_movehdup_ps(__m256 __a)
   2385 {
   2386   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
   2387 }
   2388 
   2389 /// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit
   2390 ///    vector of [8 x float] to float values in a 256-bit vector of [8 x float].
   2391 ///
   2392 /// \headerfile <x86intrin.h>
   2393 ///
   2394 /// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
   2395 ///
   2396 /// \param __a
   2397 ///    A 256-bit vector of [8 x float]. \n
   2398 ///    Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
   2399 ///    the return value. \n
   2400 ///    Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
   2401 ///    the return value. \n
   2402 ///    Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
   2403 ///    return value. \n
   2404 ///    Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
   2405 ///    return value.
   2406 /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
   2407 ///    values.
   2408 static __inline __m256 __DEFAULT_FN_ATTRS
   2409 _mm256_moveldup_ps(__m256 __a)
   2410 {
   2411   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
   2412 }
   2413 
   2414 /// \brief Moves and duplicates double-precision floating point values from a
   2415 ///    256-bit vector of [4 x double] to double-precision values in a 256-bit
   2416 ///    vector of [4 x double].
   2417 ///
   2418 /// \headerfile <x86intrin.h>
   2419 ///
   2420 /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
   2421 ///
   2422 /// \param __a
   2423 ///    A 256-bit vector of [4 x double]. \n
   2424 ///    Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
   2425 ///    return value. \n
   2426 ///    Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
   2427 ///    the return value.
   2428 /// \returns A 256-bit vector of [4 x double] containing the moved and
   2429 ///    duplicated values.
   2430 static __inline __m256d __DEFAULT_FN_ATTRS
   2431 _mm256_movedup_pd(__m256d __a)
   2432 {
   2433   return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
   2434 }
   2435 
   2436 /* Unpack and Interleave */
   2437 /// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of
   2438 ///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
   2439 ///
   2440 /// \headerfile <x86intrin.h>
   2441 ///
   2442 /// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
   2443 ///
   2444 /// \param __a
   2445 ///    A 256-bit floating-point vector of [4 x double]. \n
   2446 ///    Bits [127:64] are written to bits [63:0] of the return value. \n
   2447 ///    Bits [255:192] are written to bits [191:128] of the return value.
   2448 /// \param __b
   2449 ///    A 256-bit floating-point vector of [4 x double]. \n
   2450 ///    Bits [127:64] are written to bits [127:64] of the return value. \n
   2451 ///    Bits [255:192] are written to bits [255:192] of the return value.
   2452 /// \returns A 256-bit vector of [4 x double] containing the interleaved values.
   2453 static __inline __m256d __DEFAULT_FN_ATTRS
   2454 _mm256_unpackhi_pd(__m256d __a, __m256d __b)
   2455 {
   2456   return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
   2457 }
   2458 
   2459 /// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of
   2460 ///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
   2461 ///
   2462 /// \headerfile <x86intrin.h>
   2463 ///
   2464 /// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
   2465 ///
   2466 /// \param __a
   2467 ///    A 256-bit floating-point vector of [4 x double]. \n
   2468 ///    Bits [63:0] are written to bits [63:0] of the return value. \n
   2469 ///    Bits [191:128] are written to bits [191:128] of the return value.
   2470 /// \param __b
   2471 ///    A 256-bit floating-point vector of [4 x double]. \n
   2472 ///    Bits [63:0] are written to bits [127:64] of the return value. \n
   2473 ///    Bits [191:128] are written to bits [255:192] of the return value.
   2474 /// \returns A 256-bit vector of [4 x double] containing the interleaved values.
   2475 static __inline __m256d __DEFAULT_FN_ATTRS
   2476 _mm256_unpacklo_pd(__m256d __a, __m256d __b)
   2477 {
   2478   return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
   2479 }
   2480 
   2481 /// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
   2482 ///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
   2483 ///    vector of [8 x float].
   2484 ///
   2485 /// \headerfile <x86intrin.h>
   2486 ///
   2487 /// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
   2488 ///
   2489 /// \param __a
   2490 ///    A 256-bit vector of [8 x float]. \n
   2491 ///    Bits [95:64] are written to bits [31:0] of the return value. \n
   2492 ///    Bits [127:96] are written to bits [95:64] of the return value. \n
   2493 ///    Bits [223:192] are written to bits [159:128] of the return value. \n
   2494 ///    Bits [255:224] are written to bits [223:192] of the return value.
   2495 /// \param __b
   2496 ///    A 256-bit vector of [8 x float]. \n
   2497 ///    Bits [95:64] are written to bits [63:32] of the return value. \n
   2498 ///    Bits [127:96] are written to bits [127:96] of the return value. \n
   2499 ///    Bits [223:192] are written to bits [191:160] of the return value. \n
   2500 ///    Bits [255:224] are written to bits [255:224] of the return value.
   2501 /// \returns A 256-bit vector of [8 x float] containing the interleaved values.
   2502 static __inline __m256 __DEFAULT_FN_ATTRS
   2503 _mm256_unpackhi_ps(__m256 __a, __m256 __b)
   2504 {
   2505   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
   2506 }
   2507 
   2508 /// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
   2509 ///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
   2510 ///    vector of [8 x float].
   2511 ///
   2512 /// \headerfile <x86intrin.h>
   2513 ///
   2514 /// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
   2515 ///
   2516 /// \param __a
   2517 ///    A 256-bit vector of [8 x float]. \n
   2518 ///    Bits [31:0] are written to bits [31:0] of the return value. \n
   2519 ///    Bits [63:32] are written to bits [95:64] of the return value. \n
   2520 ///    Bits [159:128] are written to bits [159:128] of the return value. \n
   2521 ///    Bits [191:160] are written to bits [223:192] of the return value.
   2522 /// \param __b
   2523 ///    A 256-bit vector of [8 x float]. \n
   2524 ///    Bits [31:0] are written to bits [63:32] of the return value. \n
   2525 ///    Bits [63:32] are written to bits [127:96] of the return value. \n
   2526 ///    Bits [159:128] are written to bits [191:160] of the return value. \n
   2527 ///    Bits [191:160] are written to bits [255:224] of the return value.
   2528 /// \returns A 256-bit vector of [8 x float] containing the interleaved values.
   2529 static __inline __m256 __DEFAULT_FN_ATTRS
   2530 _mm256_unpacklo_ps(__m256 __a, __m256 __b)
   2531 {
   2532   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
   2533 }
   2534 
   2535 /* Bit Test */
   2536 /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
   2537 ///    element-by-element comparison of the double-precision element in the
   2538 ///    first source vector and the corresponding element in the second source
   2539 ///    vector. The EFLAGS register is updated as follows: \n
   2540 ///    If there is at least one pair of double-precision elements where the
   2541 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2542 ///    ZF flag is set to 1. \n
   2543 ///    If there is at least one pair of double-precision elements where the
   2544 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2545 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2546 ///    This intrinsic returns the value of the ZF flag.
   2547 ///
   2548 /// \headerfile <x86intrin.h>
   2549 ///
   2550 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
   2551 ///
   2552 /// \param __a
   2553 ///    A 128-bit vector of [2 x double].
   2554 /// \param __b
   2555 ///    A 128-bit vector of [2 x double].
   2556 /// \returns the ZF flag in the EFLAGS register.
   2557 static __inline int __DEFAULT_FN_ATTRS
   2558 _mm_testz_pd(__m128d __a, __m128d __b)
   2559 {
   2560   return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
   2561 }
   2562 
   2563 /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
   2564 ///    element-by-element comparison of the double-precision element in the
   2565 ///    first source vector and the corresponding element in the second source
   2566 ///    vector. The EFLAGS register is updated as follows: \n
   2567 ///    If there is at least one pair of double-precision elements where the
   2568 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2569 ///    ZF flag is set to 1. \n
   2570 ///    If there is at least one pair of double-precision elements where the
   2571 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2572 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2573 ///    This intrinsic returns the value of the CF flag.
   2574 ///
   2575 /// \headerfile <x86intrin.h>
   2576 ///
   2577 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
   2578 ///
   2579 /// \param __a
   2580 ///    A 128-bit vector of [2 x double].
   2581 /// \param __b
   2582 ///    A 128-bit vector of [2 x double].
   2583 /// \returns the CF flag in the EFLAGS register.
   2584 static __inline int __DEFAULT_FN_ATTRS
   2585 _mm_testc_pd(__m128d __a, __m128d __b)
   2586 {
   2587   return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
   2588 }
   2589 
   2590 /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
   2591 ///    element-by-element comparison of the double-precision element in the
   2592 ///    first source vector and the corresponding element in the second source
   2593 ///    vector. The EFLAGS register is updated as follows: \n
   2594 ///    If there is at least one pair of double-precision elements where the
   2595 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2596 ///    ZF flag is set to 1. \n
   2597 ///    If there is at least one pair of double-precision elements where the
   2598 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2599 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2600 ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
   2601 ///    otherwise it returns 0.
   2602 ///
   2603 /// \headerfile <x86intrin.h>
   2604 ///
   2605 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
   2606 ///
   2607 /// \param __a
   2608 ///    A 128-bit vector of [2 x double].
   2609 /// \param __b
   2610 ///    A 128-bit vector of [2 x double].
   2611 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
   2612 static __inline int __DEFAULT_FN_ATTRS
   2613 _mm_testnzc_pd(__m128d __a, __m128d __b)
   2614 {
   2615   return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
   2616 }
   2617 
   2618 /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
   2619 ///    element-by-element comparison of the single-precision element in the
   2620 ///    first source vector and the corresponding element in the second source
   2621 ///    vector. The EFLAGS register is updated as follows: \n
   2622 ///    If there is at least one pair of single-precision elements where the
   2623 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2624 ///    ZF flag is set to 1. \n
   2625 ///    If there is at least one pair of single-precision elements where the
   2626 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2627 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2628 ///    This intrinsic returns the value of the ZF flag.
   2629 ///
   2630 /// \headerfile <x86intrin.h>
   2631 ///
   2632 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
   2633 ///
   2634 /// \param __a
   2635 ///    A 128-bit vector of [4 x float].
   2636 /// \param __b
   2637 ///    A 128-bit vector of [4 x float].
   2638 /// \returns the ZF flag.
   2639 static __inline int __DEFAULT_FN_ATTRS
   2640 _mm_testz_ps(__m128 __a, __m128 __b)
   2641 {
   2642   return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
   2643 }
   2644 
   2645 /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
   2646 ///    element-by-element comparison of the single-precision element in the
   2647 ///    first source vector and the corresponding element in the second source
   2648 ///    vector. The EFLAGS register is updated as follows: \n
   2649 ///    If there is at least one pair of single-precision elements where the
   2650 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2651 ///    ZF flag is set to 1. \n
   2652 ///    If there is at least one pair of single-precision elements where the
   2653 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2654 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2655 ///    This intrinsic returns the value of the CF flag.
   2656 ///
   2657 /// \headerfile <x86intrin.h>
   2658 ///
   2659 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
   2660 ///
   2661 /// \param __a
   2662 ///    A 128-bit vector of [4 x float].
   2663 /// \param __b
   2664 ///    A 128-bit vector of [4 x float].
   2665 /// \returns the CF flag.
   2666 static __inline int __DEFAULT_FN_ATTRS
   2667 _mm_testc_ps(__m128 __a, __m128 __b)
   2668 {
   2669   return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
   2670 }
   2671 
   2672 /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
   2673 ///    element-by-element comparison of the single-precision element in the
   2674 ///    first source vector and the corresponding element in the second source
   2675 ///    vector. The EFLAGS register is updated as follows: \n
   2676 ///    If there is at least one pair of single-precision elements where the
   2677 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2678 ///    ZF flag is set to 1. \n
   2679 ///    If there is at least one pair of single-precision elements where the
   2680 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2681 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2682 ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
   2683 ///    otherwise it returns 0.
   2684 ///
   2685 /// \headerfile <x86intrin.h>
   2686 ///
   2687 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
   2688 ///
   2689 /// \param __a
   2690 ///    A 128-bit vector of [4 x float].
   2691 /// \param __b
   2692 ///    A 128-bit vector of [4 x float].
   2693 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
   2694 static __inline int __DEFAULT_FN_ATTRS
   2695 _mm_testnzc_ps(__m128 __a, __m128 __b)
   2696 {
   2697   return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
   2698 }
   2699 
   2700 /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
   2701 ///    element-by-element comparison of the double-precision elements in the
   2702 ///    first source vector and the corresponding elements in the second source
   2703 ///    vector. The EFLAGS register is updated as follows: \n
   2704 ///    If there is at least one pair of double-precision elements where the
   2705 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2706 ///    ZF flag is set to 1. \n
   2707 ///    If there is at least one pair of double-precision elements where the
   2708 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2709 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2710 ///    This intrinsic returns the value of the ZF flag.
   2711 ///
   2712 /// \headerfile <x86intrin.h>
   2713 ///
   2714 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
   2715 ///
   2716 /// \param __a
   2717 ///    A 256-bit vector of [4 x double].
   2718 /// \param __b
   2719 ///    A 256-bit vector of [4 x double].
   2720 /// \returns the ZF flag.
   2721 static __inline int __DEFAULT_FN_ATTRS
   2722 _mm256_testz_pd(__m256d __a, __m256d __b)
   2723 {
   2724   return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
   2725 }
   2726 
   2727 /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
   2728 ///    element-by-element comparison of the double-precision elements in the
   2729 ///    first source vector and the corresponding elements in the second source
   2730 ///    vector. The EFLAGS register is updated as follows: \n
   2731 ///    If there is at least one pair of double-precision elements where the
   2732 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2733 ///    ZF flag is set to 1. \n
   2734 ///    If there is at least one pair of double-precision elements where the
   2735 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2736 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2737 ///    This intrinsic returns the value of the CF flag.
   2738 ///
   2739 /// \headerfile <x86intrin.h>
   2740 ///
   2741 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
   2742 ///
   2743 /// \param __a
   2744 ///    A 256-bit vector of [4 x double].
   2745 /// \param __b
   2746 ///    A 256-bit vector of [4 x double].
   2747 /// \returns the CF flag.
   2748 static __inline int __DEFAULT_FN_ATTRS
   2749 _mm256_testc_pd(__m256d __a, __m256d __b)
   2750 {
   2751   return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
   2752 }
   2753 
   2754 /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
   2755 ///    element-by-element comparison of the double-precision elements in the
   2756 ///    first source vector and the corresponding elements in the second source
   2757 ///    vector. The EFLAGS register is updated as follows: \n
   2758 ///    If there is at least one pair of double-precision elements where the
   2759 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2760 ///    ZF flag is set to 1. \n
   2761 ///    If there is at least one pair of double-precision elements where the
   2762 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2763 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2764 ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
   2765 ///    otherwise it returns 0.
   2766 ///
   2767 /// \headerfile <x86intrin.h>
   2768 ///
   2769 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
   2770 ///
   2771 /// \param __a
   2772 ///    A 256-bit vector of [4 x double].
   2773 /// \param __b
   2774 ///    A 256-bit vector of [4 x double].
   2775 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
   2776 static __inline int __DEFAULT_FN_ATTRS
   2777 _mm256_testnzc_pd(__m256d __a, __m256d __b)
   2778 {
   2779   return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
   2780 }
   2781 
   2782 /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
   2783 ///    element-by-element comparison of the single-precision element in the
   2784 ///    first source vector and the corresponding element in the second source
   2785 ///    vector. The EFLAGS register is updated as follows: \n
   2786 ///    If there is at least one pair of single-precision elements where the
   2787 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2788 ///    ZF flag is set to 1. \n
   2789 ///    If there is at least one pair of single-precision elements where the
   2790 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2791 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2792 ///    This intrinsic returns the value of the ZF flag.
   2793 ///
   2794 /// \headerfile <x86intrin.h>
   2795 ///
   2796 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
   2797 ///
   2798 /// \param __a
   2799 ///    A 256-bit vector of [8 x float].
   2800 /// \param __b
   2801 ///    A 256-bit vector of [8 x float].
   2802 /// \returns the ZF flag.
   2803 static __inline int __DEFAULT_FN_ATTRS
   2804 _mm256_testz_ps(__m256 __a, __m256 __b)
   2805 {
   2806   return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
   2807 }
   2808 
   2809 /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
   2810 ///    element-by-element comparison of the single-precision element in the
   2811 ///    first source vector and the corresponding element in the second source
   2812 ///    vector. The EFLAGS register is updated as follows: \n
   2813 ///    If there is at least one pair of single-precision elements where the
   2814 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2815 ///    ZF flag is set to 1. \n
   2816 ///    If there is at least one pair of single-precision elements where the
   2817 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2818 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2819 ///    This intrinsic returns the value of the CF flag.
   2820 ///
   2821 /// \headerfile <x86intrin.h>
   2822 ///
   2823 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
   2824 ///
   2825 /// \param __a
   2826 ///    A 256-bit vector of [8 x float].
   2827 /// \param __b
   2828 ///    A 256-bit vector of [8 x float].
   2829 /// \returns the CF flag.
   2830 static __inline int __DEFAULT_FN_ATTRS
   2831 _mm256_testc_ps(__m256 __a, __m256 __b)
   2832 {
   2833   return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
   2834 }
   2835 
   2836 /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
   2837 ///    element-by-element comparison of the single-precision elements in the
   2838 ///    first source vector and the corresponding elements in the second source
   2839 ///    vector. The EFLAGS register is updated as follows: \n
   2840 ///    If there is at least one pair of single-precision elements where the
   2841 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2842 ///    ZF flag is set to 1. \n
   2843 ///    If there is at least one pair of single-precision elements where the
   2844 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2845 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2846 ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
   2847 ///    otherwise it returns 0.
   2848 ///
   2849 /// \headerfile <x86intrin.h>
   2850 ///
   2851 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
   2852 ///
   2853 /// \param __a
   2854 ///    A 256-bit vector of [8 x float].
   2855 /// \param __b
   2856 ///    A 256-bit vector of [8 x float].
   2857 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
   2858 static __inline int __DEFAULT_FN_ATTRS
   2859 _mm256_testnzc_ps(__m256 __a, __m256 __b)
   2860 {
   2861   return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
   2862 }
   2863 
   2864 /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
   2865 ///    of the two source vectors and update the EFLAGS register as follows: \n
   2866 ///    If there is at least one pair of bits where both bits are 1, the ZF flag
   2867 ///    is set to 0. Otherwise the ZF flag is set to 1. \n
   2868 ///    If there is at least one pair of bits where the bit from the first source
   2869 ///    vector is 0 and the bit from the second source vector is 1, the CF flag
   2870 ///    is set to 0. Otherwise the CF flag is set to 1. \n
   2871 ///    This intrinsic returns the value of the ZF flag.
   2872 ///
   2873 /// \headerfile <x86intrin.h>
   2874 ///
   2875 /// This intrinsic corresponds to the <c> VPTEST </c> instruction.
   2876 ///
   2877 /// \param __a
   2878 ///    A 256-bit integer vector.
   2879 /// \param __b
   2880 ///    A 256-bit integer vector.
   2881 /// \returns the ZF flag.
   2882 static __inline int __DEFAULT_FN_ATTRS
   2883 _mm256_testz_si256(__m256i __a, __m256i __b)
   2884 {
   2885   return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
   2886 }
   2887 
   2888 /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
   2889 ///    of the two source vectors and update the EFLAGS register as follows: \n
   2890 ///    If there is at least one pair of bits where both bits are 1, the ZF flag
   2891 ///    is set to 0. Otherwise the ZF flag is set to 1. \n
   2892 ///    If there is at least one pair of bits where the bit from the first source
   2893 ///    vector is 0 and the bit from the second source vector is 1, the CF flag
   2894 ///    is set to 0. Otherwise the CF flag is set to 1. \n
   2895 ///    This intrinsic returns the value of the CF flag.
   2896 ///
   2897 /// \headerfile <x86intrin.h>
   2898 ///
   2899 /// This intrinsic corresponds to the <c> VPTEST </c> instruction.
   2900 ///
   2901 /// \param __a
   2902 ///    A 256-bit integer vector.
   2903 /// \param __b
   2904 ///    A 256-bit integer vector.
   2905 /// \returns the CF flag.
   2906 static __inline int __DEFAULT_FN_ATTRS
   2907 _mm256_testc_si256(__m256i __a, __m256i __b)
   2908 {
   2909   return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
   2910 }
   2911 
   2912 /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
   2913 ///    of the two source vectors and update the EFLAGS register as follows: \n
   2914 ///    If there is at least one pair of bits where both bits are 1, the ZF flag
   2915 ///    is set to 0. Otherwise the ZF flag is set to 1. \n
   2916 ///    If there is at least one pair of bits where the bit from the first source
   2917 ///    vector is 0 and the bit from the second source vector is 1, the CF flag
   2918 ///    is set to 0. Otherwise the CF flag is set to 1. \n
   2919 ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
   2920 ///    otherwise it returns 0.
   2921 ///
   2922 /// \headerfile <x86intrin.h>
   2923 ///
   2924 /// This intrinsic corresponds to the <c> VPTEST </c> instruction.
   2925 ///
   2926 /// \param __a
   2927 ///    A 256-bit integer vector.
   2928 /// \param __b
   2929 ///    A 256-bit integer vector.
   2930 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
   2931 static __inline int __DEFAULT_FN_ATTRS
   2932 _mm256_testnzc_si256(__m256i __a, __m256i __b)
   2933 {
   2934   return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
   2935 }
   2936 
   2937 /* Vector extract sign mask */
   2938 /// \brief Extracts the sign bits of double-precision floating point elements
   2939 ///    in a 256-bit vector of [4 x double] and writes them to the lower order
   2940 ///    bits of the return value.
   2941 ///
   2942 /// \headerfile <x86intrin.h>
   2943 ///
   2944 /// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
   2945 ///
   2946 /// \param __a
   2947 ///    A 256-bit vector of [4 x double] containing the double-precision
   2948 ///    floating point values with sign bits to be extracted.
   2949 /// \returns The sign bits from the operand, written to bits [3:0].
   2950 static __inline int __DEFAULT_FN_ATTRS
   2951 _mm256_movemask_pd(__m256d __a)
   2952 {
   2953   return __builtin_ia32_movmskpd256((__v4df)__a);
   2954 }
   2955 
/// \brief Extracts the sign bits of single-precision floating point elements
///    in a 256-bit vector of [8 x float] and writes them to the lower order
///    bits of the return value.
   2959 ///
   2960 /// \headerfile <x86intrin.h>
   2961 ///
   2962 /// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
   2963 ///
   2964 /// \param __a
///    A 256-bit vector of [8 x float] containing the single-precision floating
///    point values with sign bits to be extracted.
   2967 /// \returns The sign bits from the operand, written to bits [7:0].
   2968 static __inline int __DEFAULT_FN_ATTRS
   2969 _mm256_movemask_ps(__m256 __a)
   2970 {
   2971   return __builtin_ia32_movmskps256((__v8sf)__a);
   2972 }
   2973 
   2974 /* Vector __zero */
   2975 /// \brief Zeroes the contents of all XMM or YMM registers.
   2976 ///
   2977 /// \headerfile <x86intrin.h>
   2978 ///
   2979 /// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
   2980 static __inline void __DEFAULT_FN_ATTRS
   2981 _mm256_zeroall(void)
   2982 {
   2983   __builtin_ia32_vzeroall();
   2984 }
   2985 
   2986 /// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
   2987 ///
   2988 /// \headerfile <x86intrin.h>
   2989 ///
   2990 /// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
   2991 static __inline void __DEFAULT_FN_ATTRS
   2992 _mm256_zeroupper(void)
   2993 {
   2994   __builtin_ia32_vzeroupper();
   2995 }
   2996 
   2997 /* Vector load with broadcast */
   2998 /// \brief Loads a scalar single-precision floating point value from the
   2999 ///    specified address pointed to by \a __a and broadcasts it to the elements
   3000 ///    of a [4 x float] vector.
   3001 ///
   3002 /// \headerfile <x86intrin.h>
   3003 ///
   3004 /// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
   3005 ///
   3006 /// \param __a
   3007 ///    The single-precision floating point value to be broadcast.
   3008 /// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
   3009 ///    equal to the broadcast value.
   3010 static __inline __m128 __DEFAULT_FN_ATTRS
   3011 _mm_broadcast_ss(float const *__a)
   3012 {
   3013   float __f = *__a;
   3014   return (__m128)(__v4sf){ __f, __f, __f, __f };
   3015 }
   3016 
   3017 /// \brief Loads a scalar double-precision floating point value from the
   3018 ///    specified address pointed to by \a __a and broadcasts it to the elements
   3019 ///    of a [4 x double] vector.
   3020 ///
   3021 /// \headerfile <x86intrin.h>
   3022 ///
   3023 /// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
   3024 ///
   3025 /// \param __a
   3026 ///    The double-precision floating point value to be broadcast.
   3027 /// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
   3028 ///    equal to the broadcast value.
   3029 static __inline __m256d __DEFAULT_FN_ATTRS
   3030 _mm256_broadcast_sd(double const *__a)
   3031 {
   3032   double __d = *__a;
   3033   return (__m256d)(__v4df){ __d, __d, __d, __d };
   3034 }
   3035 
   3036 /// \brief Loads a scalar single-precision floating point value from the
   3037 ///    specified address pointed to by \a __a and broadcasts it to the elements
   3038 ///    of a [8 x float] vector.
   3039 ///
   3040 /// \headerfile <x86intrin.h>
   3041 ///
   3042 /// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
   3043 ///
   3044 /// \param __a
   3045 ///    The single-precision floating point value to be broadcast.
   3046 /// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
   3047 ///    equal to the broadcast value.
   3048 static __inline __m256 __DEFAULT_FN_ATTRS
   3049 _mm256_broadcast_ss(float const *__a)
   3050 {
   3051   float __f = *__a;
   3052   return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
   3053 }
   3054 
   3055 /// \brief Loads the data from a 128-bit vector of [2 x double] from the
   3056 ///    specified address pointed to by \a __a and broadcasts it to 128-bit
   3057 ///    elements in a 256-bit vector of [4 x double].
   3058 ///
   3059 /// \headerfile <x86intrin.h>
   3060 ///
   3061 /// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
   3062 ///
   3063 /// \param __a
   3064 ///    The 128-bit vector of [2 x double] to be broadcast.
   3065 /// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
   3066 ///    equal to the broadcast value.
   3067 static __inline __m256d __DEFAULT_FN_ATTRS
   3068 _mm256_broadcast_pd(__m128d const *__a)
   3069 {
   3070   return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
   3071 }
   3072 
   3073 /// \brief Loads the data from a 128-bit vector of [4 x float] from the
   3074 ///    specified address pointed to by \a __a and broadcasts it to 128-bit
   3075 ///    elements in a 256-bit vector of [8 x float].
   3076 ///
   3077 /// \headerfile <x86intrin.h>
   3078 ///
   3079 /// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
   3080 ///
   3081 /// \param __a
   3082 ///    The 128-bit vector of [4 x float] to be broadcast.
   3083 /// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
   3084 ///    equal to the broadcast value.
   3085 static __inline __m256 __DEFAULT_FN_ATTRS
   3086 _mm256_broadcast_ps(__m128 const *__a)
   3087 {
   3088   return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
   3089 }
   3090 
   3091 /* SIMD load ops */
   3092 /// \brief Loads 4 double-precision floating point values from a 32-byte aligned
   3093 ///    memory location pointed to by \a __p into a vector of [4 x double].
   3094 ///
   3095 /// \headerfile <x86intrin.h>
   3096 ///
   3097 /// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
   3098 ///
   3099 /// \param __p
   3100 ///    A 32-byte aligned pointer to a memory location containing
   3101 ///    double-precision floating point values.
   3102 /// \returns A 256-bit vector of [4 x double] containing the moved values.
   3103 static __inline __m256d __DEFAULT_FN_ATTRS
   3104 _mm256_load_pd(double const *__p)
   3105 {
   3106   return *(__m256d *)__p;
   3107 }
   3108 
   3109 /// \brief Loads 8 single-precision floating point values from a 32-byte aligned
   3110 ///    memory location pointed to by \a __p into a vector of [8 x float].
   3111 ///
   3112 /// \headerfile <x86intrin.h>
   3113 ///
   3114 /// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
   3115 ///
   3116 /// \param __p
   3117 ///    A 32-byte aligned pointer to a memory location containing float values.
   3118 /// \returns A 256-bit vector of [8 x float] containing the moved values.
   3119 static __inline __m256 __DEFAULT_FN_ATTRS
   3120 _mm256_load_ps(float const *__p)
   3121 {
   3122   return *(__m256 *)__p;
   3123 }
   3124 
   3125 /// \brief Loads 4 double-precision floating point values from an unaligned
   3126 ///    memory location pointed to by \a __p into a vector of [4 x double].
   3127 ///
   3128 /// \headerfile <x86intrin.h>
   3129 ///
   3130 /// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
   3131 ///
   3132 /// \param __p
   3133 ///    A pointer to a memory location containing double-precision floating
   3134 ///    point values.
   3135 /// \returns A 256-bit vector of [4 x double] containing the moved values.
   3136 static __inline __m256d __DEFAULT_FN_ATTRS
   3137 _mm256_loadu_pd(double const *__p)
   3138 {
   3139   struct __loadu_pd {
   3140     __m256d __v;
   3141   } __attribute__((__packed__, __may_alias__));
   3142   return ((struct __loadu_pd*)__p)->__v;
   3143 }
   3144 
   3145 /// \brief Loads 8 single-precision floating point values from an unaligned
   3146 ///    memory location pointed to by \a __p into a vector of [8 x float].
   3147 ///
   3148 /// \headerfile <x86intrin.h>
   3149 ///
   3150 /// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
   3151 ///
   3152 /// \param __p
   3153 ///    A pointer to a memory location containing single-precision floating
   3154 ///    point values.
   3155 /// \returns A 256-bit vector of [8 x float] containing the moved values.
   3156 static __inline __m256 __DEFAULT_FN_ATTRS
   3157 _mm256_loadu_ps(float const *__p)
   3158 {
   3159   struct __loadu_ps {
   3160     __m256 __v;
   3161   } __attribute__((__packed__, __may_alias__));
   3162   return ((struct __loadu_ps*)__p)->__v;
   3163 }
   3164 
   3165 /// \brief Loads 256 bits of integer data from a 32-byte aligned memory
   3166 ///    location pointed to by \a __p into elements of a 256-bit integer vector.
   3167 ///
   3168 /// \headerfile <x86intrin.h>
   3169 ///
   3170 /// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
   3171 ///
   3172 /// \param __p
   3173 ///    A 32-byte aligned pointer to a 256-bit integer vector containing integer
   3174 ///    values.
   3175 /// \returns A 256-bit integer vector containing the moved values.
   3176 static __inline __m256i __DEFAULT_FN_ATTRS
   3177 _mm256_load_si256(__m256i const *__p)
   3178 {
   3179   return *__p;
   3180 }
   3181 
   3182 /// \brief Loads 256 bits of integer data from an unaligned memory location
   3183 ///    pointed to by \a __p into a 256-bit integer vector.
   3184 ///
   3185 /// \headerfile <x86intrin.h>
   3186 ///
   3187 /// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
   3188 ///
   3189 /// \param __p
   3190 ///    A pointer to a 256-bit integer vector containing integer values.
   3191 /// \returns A 256-bit integer vector containing the moved values.
   3192 static __inline __m256i __DEFAULT_FN_ATTRS
   3193 _mm256_loadu_si256(__m256i const *__p)
   3194 {
   3195   struct __loadu_si256 {
   3196     __m256i __v;
   3197   } __attribute__((__packed__, __may_alias__));
   3198   return ((struct __loadu_si256*)__p)->__v;
   3199 }
   3200 
   3201 /// \brief Loads 256 bits of integer data from an unaligned memory location
   3202 ///    pointed to by \a __p into a 256-bit integer vector. This intrinsic may
   3203 ///    perform better than \c _mm256_loadu_si256 when the data crosses a cache
   3204 ///    line boundary.
   3205 ///
   3206 /// \headerfile <x86intrin.h>
   3207 ///
   3208 /// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
   3209 ///
   3210 /// \param __p
   3211 ///    A pointer to a 256-bit integer vector containing integer values.
   3212 /// \returns A 256-bit integer vector containing the moved values.
   3213 static __inline __m256i __DEFAULT_FN_ATTRS
   3214 _mm256_lddqu_si256(__m256i const *__p)
   3215 {
   3216   return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
   3217 }
   3218 
   3219 /* SIMD store ops */
   3220 /// \brief Stores double-precision floating point values from a 256-bit vector
   3221 ///    of [4 x double] to a 32-byte aligned memory location pointed to by
   3222 ///    \a __p.
   3223 ///
   3224 /// \headerfile <x86intrin.h>
   3225 ///
   3226 /// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
   3227 ///
   3228 /// \param __p
   3229 ///    A 32-byte aligned pointer to a memory location that will receive the
///    double-precision floating point values.
   3231 /// \param __a
   3232 ///    A 256-bit vector of [4 x double] containing the values to be moved.
   3233 static __inline void __DEFAULT_FN_ATTRS
   3234 _mm256_store_pd(double *__p, __m256d __a)
   3235 {
   3236   *(__m256d *)__p = __a;
   3237 }
   3238 
   3239 /// \brief Stores single-precision floating point values from a 256-bit vector
   3240 ///    of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
   3241 ///
   3242 /// \headerfile <x86intrin.h>
   3243 ///
   3244 /// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
   3245 ///
   3246 /// \param __p
   3247 ///    A 32-byte aligned pointer to a memory location that will receive the
   3248 ///    float values.
   3249 /// \param __a
   3250 ///    A 256-bit vector of [8 x float] containing the values to be moved.
   3251 static __inline void __DEFAULT_FN_ATTRS
   3252 _mm256_store_ps(float *__p, __m256 __a)
   3253 {
   3254   *(__m256 *)__p = __a;
   3255 }
   3256 
   3257 /// \brief Stores double-precision floating point values from a 256-bit vector
   3258 ///    of [4 x double] to an unaligned memory location pointed to by \a __p.
   3259 ///
   3260 /// \headerfile <x86intrin.h>
   3261 ///
   3262 /// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
   3263 ///
   3264 /// \param __p
   3265 ///    A pointer to a memory location that will receive the double-precision
   3266 ///    floating point values.
   3267 /// \param __a
   3268 ///    A 256-bit vector of [4 x double] containing the values to be moved.
   3269 static __inline void __DEFAULT_FN_ATTRS
   3270 _mm256_storeu_pd(double *__p, __m256d __a)
   3271 {
   3272   struct __storeu_pd {
   3273     __m256d __v;
   3274   } __attribute__((__packed__, __may_alias__));
   3275   ((struct __storeu_pd*)__p)->__v = __a;
   3276 }
   3277 
   3278 /// \brief Stores single-precision floating point values from a 256-bit vector
   3279 ///    of [8 x float] to an unaligned memory location pointed to by \a __p.
   3280 ///
   3281 /// \headerfile <x86intrin.h>
   3282 ///
   3283 /// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
   3284 ///
   3285 /// \param __p
   3286 ///    A pointer to a memory location that will receive the float values.
   3287 /// \param __a
   3288 ///    A 256-bit vector of [8 x float] containing the values to be moved.
   3289 static __inline void __DEFAULT_FN_ATTRS
   3290 _mm256_storeu_ps(float *__p, __m256 __a)
   3291 {
   3292   struct __storeu_ps {
   3293     __m256 __v;
   3294   } __attribute__((__packed__, __may_alias__));
   3295   ((struct __storeu_ps*)__p)->__v = __a;
   3296 }
   3297 
   3298 /// \brief Stores integer values from a 256-bit integer vector to a 32-byte
   3299 ///    aligned memory location pointed to by \a __p.
   3300 ///
   3301 /// \headerfile <x86intrin.h>
   3302 ///
   3303 /// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
   3304 ///
   3305 /// \param __p
   3306 ///    A 32-byte aligned pointer to a memory location that will receive the
   3307 ///    integer values.
   3308 /// \param __a
   3309 ///    A 256-bit integer vector containing the values to be moved.
   3310 static __inline void __DEFAULT_FN_ATTRS
   3311 _mm256_store_si256(__m256i *__p, __m256i __a)
   3312 {
   3313   *__p = __a;
   3314 }
   3315 
   3316 /// \brief Stores integer values from a 256-bit integer vector to an unaligned
   3317 ///    memory location pointed to by \a __p.
   3318 ///
   3319 /// \headerfile <x86intrin.h>
   3320 ///
   3321 /// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
   3322 ///
   3323 /// \param __p
   3324 ///    A pointer to a memory location that will receive the integer values.
   3325 /// \param __a
   3326 ///    A 256-bit integer vector containing the values to be moved.
   3327 static __inline void __DEFAULT_FN_ATTRS
   3328 _mm256_storeu_si256(__m256i *__p, __m256i __a)
   3329 {
   3330   struct __storeu_si256 {
   3331     __m256i __v;
   3332   } __attribute__((__packed__, __may_alias__));
   3333   ((struct __storeu_si256*)__p)->__v = __a;
   3334 }
   3335 
   3336 /* Conditional load ops */
   3337 /// \brief Conditionally loads double-precision floating point elements from a
   3338 ///    memory location pointed to by \a __p into a 128-bit vector of
   3339 ///    [2 x double], depending on the mask bits associated with each data
   3340 ///    element.
   3341 ///
   3342 /// \headerfile <x86intrin.h>
   3343 ///
   3344 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
   3345 ///
   3346 /// \param __p
   3347 ///    A pointer to a memory location that contains the double-precision
   3348 ///    floating point values.
   3349 /// \param __m
   3350 ///    A 128-bit integer vector containing the mask. The most significant bit of
   3351 ///    each data element represents the mask bits. If a mask bit is zero, the
   3352 ///    corresponding value in the memory location is not loaded and the
   3353 ///    corresponding field in the return value is set to zero.
   3354 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
   3355 static __inline __m128d __DEFAULT_FN_ATTRS
   3356 _mm_maskload_pd(double const *__p, __m128i __m)
   3357 { /* casts adapt the public types to the builtin's __v2df pointer / __v2di mask operands */
   3358   return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
   3359 }
   3360 
   3361 /// \brief Conditionally loads double-precision floating point elements from a
   3362 ///    memory location pointed to by \a __p into a 256-bit vector of
   3363 ///    [4 x double], depending on the mask bits associated with each data
   3364 ///    element.
   3365 ///
   3366 /// \headerfile <x86intrin.h>
   3367 ///
   3368 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
   3369 ///
   3370 /// \param __p
   3371 ///    A pointer to a memory location that contains the double-precision
   3372 ///    floating point values.
   3373 /// \param __m
   3374 ///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
   3375 ///    significant bit of each quadword element represents the mask bits. If a
   3376 ///    mask bit is zero, the corresponding value in the memory location is not
   3377 ///    loaded and the corresponding field in the return value is set to zero.
   3378 /// \returns A 256-bit vector of [4 x double] containing the loaded values.
   3379 static __inline __m256d __DEFAULT_FN_ATTRS
   3380 _mm256_maskload_pd(double const *__p, __m256i __m)
   3381 { /* casts adapt the public types to the builtin's __v4df pointer / __v4di mask operands */
   3382   return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
   3383                                                (__v4di)__m);
   3384 }
   3385 
   3386 /// \brief Conditionally loads single-precision floating point elements from a
   3387 ///    memory location pointed to by \a __p into a 128-bit vector of
   3388 ///    [4 x float], depending on the mask bits associated with each data
   3389 ///    element.
   3390 ///
   3391 /// \headerfile <x86intrin.h>
   3392 ///
   3393 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
   3394 ///
   3395 /// \param __p
   3396 ///    A pointer to a memory location that contains the single-precision
   3397 ///    floating point values.
   3398 /// \param __m
   3399 ///    A 128-bit integer vector containing the mask. The most significant bit of
   3400 ///    each data element represents the mask bits. If a mask bit is zero, the
   3401 ///    corresponding value in the memory location is not loaded and the
   3402 ///    corresponding field in the return value is set to zero.
   3403 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
   3404 static __inline __m128 __DEFAULT_FN_ATTRS
   3405 _mm_maskload_ps(float const *__p, __m128i __m)
   3406 { /* casts adapt the public types to the builtin's __v4sf pointer / __v4si mask operands */
   3407   return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
   3408 }
   3409 
   3410 /// \brief Conditionally loads single-precision floating point elements from a
   3411 ///    memory location pointed to by \a __p into a 256-bit vector of
   3412 ///    [8 x float], depending on the mask bits associated with each data
   3413 ///    element.
   3414 ///
   3415 /// \headerfile <x86intrin.h>
   3416 ///
   3417 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
   3418 ///
   3419 /// \param __p
   3420 ///    A pointer to a memory location that contains the single-precision
   3421 ///    floating point values.
   3422 /// \param __m
   3423 ///    A 256-bit integer vector of [8 x dword] containing the mask. The most
   3424 ///    significant bit of each dword element represents the mask bits. If a mask
   3425 ///    bit is zero, the corresponding value in the memory location is not loaded
   3426 ///    and the corresponding field in the return value is set to zero.
   3427 /// \returns A 256-bit vector of [8 x float] containing the loaded values.
   3428 static __inline __m256 __DEFAULT_FN_ATTRS
   3429 _mm256_maskload_ps(float const *__p, __m256i __m)
   3430 { /* casts adapt the public types to the builtin's __v8sf pointer / __v8si mask operands */
   3431   return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
   3432 }
   3433 
   3434 /* Conditional store ops */
   3435 /// \brief Moves single-precision floating point values from a 256-bit vector
   3436 ///    of [8 x float] to a memory location pointed to by \a __p, according to
   3437 ///    the specified mask.
   3438 ///
   3439 /// \headerfile <x86intrin.h>
   3440 ///
   3441 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
   3442 ///
   3443 /// \param __p
   3444 ///    A pointer to a memory location that will receive the float values.
   3445 /// \param __m
   3446 ///    A 256-bit integer vector of [8 x dword] containing the mask. The most
   3447 ///    significant bit of each dword element in the mask vector represents the
   3448 ///    mask bits. If a mask bit is zero, the corresponding value from vector
   3449 ///    \a __a is not stored and the corresponding field in the memory location
   3450 ///    pointed to by \a __p is not changed.
   3451 /// \param __a
   3452 ///    A 256-bit vector of [8 x float] containing the values to be stored.
   3453 static __inline void __DEFAULT_FN_ATTRS
   3454 _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
   3455 { /* argument order is preserved: destination, mask, then the values to store */
   3456   __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
   3457 }
   3458 
   3459 /// \brief Moves double-precision values from a 128-bit vector of [2 x double]
   3460 ///    to a memory location pointed to by \a __p, according to the specified
   3461 ///    mask.
   3462 ///
   3463 /// \headerfile <x86intrin.h>
   3464 ///
   3465 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
   3466 ///
   3467 /// \param __p
   3468 ///    A pointer to a memory location that will receive the double values.
   3469 /// \param __m
   3470 ///    A 128-bit integer vector containing the mask. The most significant bit of
   3471 ///    each field in the mask vector represents the mask bits. If a mask bit is
   3472 ///    zero, the corresponding value from vector \a __a is not stored and the
   3473 ///    corresponding field in the memory location pointed to by \a __p is not
   3474 ///    changed.
   3475 /// \param __a
   3476 ///    A 128-bit vector of [2 x double] containing the values to be stored.
   3477 static __inline void __DEFAULT_FN_ATTRS
   3478 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
   3479 { /* argument order is preserved: destination, mask, then the values to store */
   3480   __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
   3481 }
   3482 
   3483 /// \brief Moves double-precision values from a 256-bit vector of [4 x double]
   3484 ///    to a memory location pointed to by \a __p, according to the specified
   3485 ///    mask.
   3486 ///
   3487 /// \headerfile <x86intrin.h>
   3488 ///
   3489 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
   3490 ///
   3491 /// \param __p
   3492 ///    A pointer to a memory location that will receive the double values.
   3493 /// \param __m
   3494 ///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
   3495 ///    significant bit of each quadword element in the mask vector represents
   3496 ///    the mask bits. If a mask bit is zero, the corresponding value from vector
   3497 ///    \a __a is not stored and the corresponding field in the memory location
   3498 ///    pointed to by \a __p is not changed.
   3499 /// \param __a
   3500 ///    A 256-bit vector of [4 x double] containing the values to be stored.
   3501 static __inline void __DEFAULT_FN_ATTRS
   3502 _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
   3503 { /* argument order is preserved: destination, mask, then the values to store */
   3504   __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
   3505 }
   3506 
   3507 /// \brief Moves single-precision floating point values from a 128-bit vector
   3508 ///    of [4 x float] to a memory location pointed to by \a __p, according to
   3509 ///    the specified mask.
   3510 ///
   3511 /// \headerfile <x86intrin.h>
   3512 ///
   3513 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
   3514 ///
   3515 /// \param __p
   3516 ///    A pointer to a memory location that will receive the float values.
   3517 /// \param __m
   3518 ///    A 128-bit integer vector containing the mask. The most significant bit of
   3519 ///    each field in the mask vector represents the mask bits. If a mask bit is
   3520 ///    zero, the corresponding value from vector \a __a is not stored and the
   3521 ///    corresponding field in the memory location pointed to by \a __p is not
   3522 ///    changed.
   3523 /// \param __a
   3524 ///    A 128-bit vector of [4 x float] containing the values to be stored.
   3525 static __inline void __DEFAULT_FN_ATTRS
   3526 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
   3527 { /* argument order is preserved: destination, mask, then the values to store */
   3528   __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
   3529 }
   3530 
   3531 /* Cacheability support ops */
   3532 /// \brief Moves integer data from a 256-bit integer vector to a 32-byte
   3533 ///    aligned memory location. To minimize caching, the data is flagged as
   3534 ///    non-temporal (unlikely to be used again soon).
   3535 ///
   3536 /// \headerfile <x86intrin.h>
   3537 ///
   3538 /// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
   3539 ///
   3540 /// \param __a
   3541 ///    A pointer to a 32-byte aligned memory location that will receive the
   3542 ///    integer values.
   3543 /// \param __b
   3544 ///    A 256-bit integer vector containing the values to be moved.
   3545 static __inline void __DEFAULT_FN_ATTRS
   3546 _mm256_stream_si256(__m256i *__a, __m256i __b)
   3547 { /* note the swap: the builtin takes the value (__b) first and the destination (__a) second */
   3548   __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
   3549 }
   3550 
   3551 /// \brief Moves double-precision values from a 256-bit vector of [4 x double]
   3552 ///    to a 32-byte aligned memory location. To minimize caching, the data is
   3553 ///    flagged as non-temporal (unlikely to be used again soon).
   3554 ///
   3555 /// \headerfile <x86intrin.h>
   3556 ///
   3557 /// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
   3558 ///
   3559 /// \param __a
   3560 ///    A pointer to a 32-byte aligned memory location that will receive the
   3561 ///    double-precision floating-point values.
   3562 /// \param __b
   3563 ///    A 256-bit vector of [4 x double] containing the values to be moved.
   3564 static __inline void __DEFAULT_FN_ATTRS
   3565 _mm256_stream_pd(double *__a, __m256d __b)
   3566 { /* note the swap: the builtin takes the value (__b) first and the destination (__a) second */
   3567   __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
   3568 }
   3569 
   3570 /// \brief Moves single-precision floating point values from a 256-bit vector
   3571 ///    of [8 x float] to a 32-byte aligned memory location. To minimize
   3572 ///    caching, the data is flagged as non-temporal (unlikely to be used again
   3573 ///    soon).
   3574 ///
   3575 /// \headerfile <x86intrin.h>
   3576 ///
   3577 /// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
   3578 ///
   3579 /// \param __p
   3580 ///    A pointer to a 32-byte aligned memory location that will receive the
   3581 ///    single-precision floating point values.
   3582 /// \param __a
   3583 ///    A 256-bit vector of [8 x float] containing the values to be moved.
   3584 static __inline void __DEFAULT_FN_ATTRS
   3585 _mm256_stream_ps(float *__p, __m256 __a)
   3586 { /* note the swap: the builtin takes the value (__a) first and the destination (__p) second */
   3587   __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
   3588 }
   3589 
   3590 /* Create vectors */
   3591 /// \brief Create a 256-bit vector of [4 x double] with undefined values.
   3592 ///
   3593 /// \headerfile <x86intrin.h>
   3594 ///
   3595 /// This intrinsic has no corresponding instruction.
   3596 ///
   3597 /// \returns A 256-bit vector of [4 x double] containing undefined values.
   3598 static __inline__ __m256d __DEFAULT_FN_ATTRS
   3599 _mm256_undefined_pd(void)
   3600 {
   3601   return (__m256d)__builtin_ia32_undef256(); /* same builtin as the _ps/_si256 variants; only the result cast differs */
   3602 }
   3603 
   3604 /// \brief Create a 256-bit vector of [8 x float] with undefined values.
   3605 ///
   3606 /// \headerfile <x86intrin.h>
   3607 ///
   3608 /// This intrinsic has no corresponding instruction.
   3609 ///
   3610 /// \returns A 256-bit vector of [8 x float] containing undefined values.
   3611 static __inline__ __m256 __DEFAULT_FN_ATTRS
   3612 _mm256_undefined_ps(void)
   3613 {
   3614   return (__m256)__builtin_ia32_undef256(); /* same builtin as the _pd/_si256 variants; only the result cast differs */
   3615 }
   3616 
   3617 /// \brief Create a 256-bit integer vector with undefined values.
   3618 ///
   3619 /// \headerfile <x86intrin.h>
   3620 ///
   3621 /// This intrinsic has no corresponding instruction.
   3622 ///
   3623 /// \returns A 256-bit integer vector containing undefined values.
   3624 static __inline__ __m256i __DEFAULT_FN_ATTRS
   3625 _mm256_undefined_si256(void)
   3626 {
   3627   return (__m256i)__builtin_ia32_undef256(); /* same builtin as the _pd/_ps variants; only the result cast differs */
   3628 }
   3629 
   3630 /// \brief Constructs a 256-bit floating-point vector of [4 x double]
   3631 ///    initialized with the specified double-precision floating-point values.
   3632 ///
   3633 /// \headerfile <x86intrin.h>
   3634 ///
   3635 /// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
   3636 ///   instruction.
   3637 ///
   3638 /// \param __a
   3639 ///    A double-precision floating-point value used to initialize bits [255:192]
   3640 ///    of the result.
   3641 /// \param __b
   3642 ///    A double-precision floating-point value used to initialize bits [191:128]
   3643 ///    of the result.
   3644 /// \param __c
   3645 ///    A double-precision floating-point value used to initialize bits [127:64]
   3646 ///    of the result.
   3647 /// \param __d
   3648 ///    A double-precision floating-point value used to initialize bits [63:0]
   3649 ///    of the result.
   3650 /// \returns An initialized 256-bit floating-point vector of [4 x double].
   3651 static __inline __m256d __DEFAULT_FN_ATTRS
   3652 _mm256_set_pd(double __a, double __b, double __c, double __d)
   3653 {
   3654   return (__m256d){ __d, __c, __b, __a }; /* initializer runs lowest element first, so __d (bits [63:0]) leads */
   3655 }
   3656 
   3657 /// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized
   3658 ///    with the specified single-precision floating-point values.
   3659 ///
   3660 /// \headerfile <x86intrin.h>
   3661 ///
   3662 /// This intrinsic is a utility function and does not correspond to a specific
   3663 ///   instruction.
   3664 ///
   3665 /// \param __a
   3666 ///    A single-precision floating-point value used to initialize bits [255:224]
   3667 ///    of the result.
   3668 /// \param __b
   3669 ///    A single-precision floating-point value used to initialize bits [223:192]
   3670 ///    of the result.
   3671 /// \param __c
   3672 ///    A single-precision floating-point value used to initialize bits [191:160]
   3673 ///    of the result.
   3674 /// \param __d
   3675 ///    A single-precision floating-point value used to initialize bits [159:128]
   3676 ///    of the result.
   3677 /// \param __e
   3678 ///    A single-precision floating-point value used to initialize bits [127:96]
   3679 ///    of the result.
   3680 /// \param __f
   3681 ///    A single-precision floating-point value used to initialize bits [95:64]
   3682 ///    of the result.
   3683 /// \param __g
   3684 ///    A single-precision floating-point value used to initialize bits [63:32]
   3685 ///    of the result.
   3686 /// \param __h
   3687 ///    A single-precision floating-point value used to initialize bits [31:0]
   3688 ///    of the result.
   3689 /// \returns An initialized 256-bit floating-point vector of [8 x float].
   3690 static __inline __m256 __DEFAULT_FN_ATTRS
   3691 _mm256_set_ps(float __a, float __b, float __c, float __d,
   3692               float __e, float __f, float __g, float __h)
   3693 { /* initializer runs lowest element first, so __h (bits [31:0]) leads and __a ends up on top */
   3694   return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
   3695 }
   3696 
   3697 /// \brief Constructs a 256-bit integer vector initialized with the specified
   3698 ///    32-bit integral values.
   3699 ///
   3700 /// \headerfile <x86intrin.h>
   3701 ///
   3702 /// This intrinsic is a utility function and does not correspond to a specific
   3703 ///   instruction.
   3704 ///
   3705 /// \param __i0
   3706 ///    A 32-bit integral value used to initialize bits [255:224] of the result.
   3707 /// \param __i1
   3708 ///    A 32-bit integral value used to initialize bits [223:192] of the result.
   3709 /// \param __i2
   3710 ///    A 32-bit integral value used to initialize bits [191:160] of the result.
   3711 /// \param __i3
   3712 ///    A 32-bit integral value used to initialize bits [159:128] of the result.
   3713 /// \param __i4
   3714 ///    A 32-bit integral value used to initialize bits [127:96] of the result.
   3715 /// \param __i5
   3716 ///    A 32-bit integral value used to initialize bits [95:64] of the result.
   3717 /// \param __i6
   3718 ///    A 32-bit integral value used to initialize bits [63:32] of the result.
   3719 /// \param __i7
   3720 ///    A 32-bit integral value used to initialize bits [31:0] of the result.
   3721 /// \returns An initialized 256-bit integer vector.
   3722 static __inline __m256i __DEFAULT_FN_ATTRS
   3723 _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
   3724                  int __i4, int __i5, int __i6, int __i7)
   3725 { /* built as __v8si, lowest element first (__i7 = bits [31:0]), then cast to the generic __m256i */
   3726   return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
   3727 }
   3728 
   3729 /// \brief Constructs a 256-bit integer vector initialized with the specified
   3730 ///    16-bit integral values.
   3731 ///
   3732 /// \headerfile <x86intrin.h>
   3733 ///
   3734 /// This intrinsic is a utility function and does not correspond to a specific
   3735 ///   instruction.
   3736 ///
   3737 /// \param __w15
   3738 ///    A 16-bit integral value used to initialize bits [255:240] of the result.
   3739 /// \param __w14
   3740 ///    A 16-bit integral value used to initialize bits [239:224] of the result.
   3741 /// \param __w13
   3742 ///    A 16-bit integral value used to initialize bits [223:208] of the result.
   3743 /// \param __w12
   3744 ///    A 16-bit integral value used to initialize bits [207:192] of the result.
   3745 /// \param __w11
   3746 ///    A 16-bit integral value used to initialize bits [191:176] of the result.
   3747 /// \param __w10
   3748 ///    A 16-bit integral value used to initialize bits [175:160] of the result.
   3749 /// \param __w09
   3750 ///    A 16-bit integral value used to initialize bits [159:144] of the result.
   3751 /// \param __w08
   3752 ///    A 16-bit integral value used to initialize bits [143:128] of the result.
   3753 /// \param __w07
   3754 ///    A 16-bit integral value used to initialize bits [127:112] of the result.
   3755 /// \param __w06
   3756 ///    A 16-bit integral value used to initialize bits [111:96] of the result.
   3757 /// \param __w05
   3758 ///    A 16-bit integral value used to initialize bits [95:80] of the result.
   3759 /// \param __w04
   3760 ///    A 16-bit integral value used to initialize bits [79:64] of the result.
   3761 /// \param __w03
   3762 ///    A 16-bit integral value used to initialize bits [63:48] of the result.
   3763 /// \param __w02
   3764 ///    A 16-bit integral value used to initialize bits [47:32] of the result.
   3765 /// \param __w01
   3766 ///    A 16-bit integral value used to initialize bits [31:16] of the result.
   3767 /// \param __w00
   3768 ///    A 16-bit integral value used to initialize bits [15:0] of the result.
   3769 /// \returns An initialized 256-bit integer vector.
   3770 static __inline __m256i __DEFAULT_FN_ATTRS
   3771 _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
   3772                  short __w11, short __w10, short __w09, short __w08,
   3773                  short __w07, short __w06, short __w05, short __w04,
   3774                  short __w03, short __w02, short __w01, short __w00)
   3775 { /* parameters arrive high-to-low; the initializer lists them low-to-high (__w00 = bits [15:0]) */
   3776   return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
   3777     __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
   3778 }
   3779 
   3780 /// \brief Constructs a 256-bit integer vector initialized with the specified
   3781 ///    8-bit integral values.
   3782 ///
   3783 /// \headerfile <x86intrin.h>
   3784 ///
   3785 /// This intrinsic is a utility function and does not correspond to a specific
   3786 ///   instruction.
   3787 ///
   3788 /// \param __b31
   3789 ///    An 8-bit integral value used to initialize bits [255:248] of the result.
   3790 /// \param __b30
   3791 ///    An 8-bit integral value used to initialize bits [247:240] of the result.
   3792 /// \param __b29
   3793 ///    An 8-bit integral value used to initialize bits [239:232] of the result.
   3794 /// \param __b28
   3795 ///    An 8-bit integral value used to initialize bits [231:224] of the result.
   3796 /// \param __b27
   3797 ///    An 8-bit integral value used to initialize bits [223:216] of the result.
   3798 /// \param __b26
   3799 ///    An 8-bit integral value used to initialize bits [215:208] of the result.
   3800 /// \param __b25
   3801 ///    An 8-bit integral value used to initialize bits [207:200] of the result.
   3802 /// \param __b24
   3803 ///    An 8-bit integral value used to initialize bits [199:192] of the result.
   3804 /// \param __b23
   3805 ///    An 8-bit integral value used to initialize bits [191:184] of the result.
   3806 /// \param __b22
   3807 ///    An 8-bit integral value used to initialize bits [183:176] of the result.
   3808 /// \param __b21
   3809 ///    An 8-bit integral value used to initialize bits [175:168] of the result.
   3810 /// \param __b20
   3811 ///    An 8-bit integral value used to initialize bits [167:160] of the result.
   3812 /// \param __b19
   3813 ///    An 8-bit integral value used to initialize bits [159:152] of the result.
   3814 /// \param __b18
   3815 ///    An 8-bit integral value used to initialize bits [151:144] of the result.
   3816 /// \param __b17
   3817 ///    An 8-bit integral value used to initialize bits [143:136] of the result.
   3818 /// \param __b16
   3819 ///    An 8-bit integral value used to initialize bits [135:128] of the result.
   3820 /// \param __b15
   3821 ///    An 8-bit integral value used to initialize bits [127:120] of the result.
   3822 /// \param __b14
   3823 ///    An 8-bit integral value used to initialize bits [119:112] of the result.
   3824 /// \param __b13
   3825 ///    An 8-bit integral value used to initialize bits [111:104] of the result.
   3826 /// \param __b12
   3827 ///    An 8-bit integral value used to initialize bits [103:96] of the result.
   3828 /// \param __b11
   3829 ///    An 8-bit integral value used to initialize bits [95:88] of the result.
   3830 /// \param __b10
   3831 ///    An 8-bit integral value used to initialize bits [87:80] of the result.
   3832 /// \param __b09
   3833 ///    An 8-bit integral value used to initialize bits [79:72] of the result.
   3834 /// \param __b08
   3835 ///    An 8-bit integral value used to initialize bits [71:64] of the result.
   3836 /// \param __b07
   3837 ///    An 8-bit integral value used to initialize bits [63:56] of the result.
   3838 /// \param __b06
   3839 ///    An 8-bit integral value used to initialize bits [55:48] of the result.
   3840 /// \param __b05
   3841 ///    An 8-bit integral value used to initialize bits [47:40] of the result.
   3842 /// \param __b04
   3843 ///    An 8-bit integral value used to initialize bits [39:32] of the result.
   3844 /// \param __b03
   3845 ///    An 8-bit integral value used to initialize bits [31:24] of the result.
   3846 /// \param __b02
   3847 ///    An 8-bit integral value used to initialize bits [23:16] of the result.
   3848 /// \param __b01
   3849 ///    An 8-bit integral value used to initialize bits [15:8] of the result.
   3850 /// \param __b00
   3851 ///    An 8-bit integral value used to initialize bits [7:0] of the result.
   3852 /// \returns An initialized 256-bit integer vector.
   3853 static __inline __m256i __DEFAULT_FN_ATTRS
   3854 _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
   3855                 char __b27, char __b26, char __b25, char __b24,
   3856                 char __b23, char __b22, char __b21, char __b20,
   3857                 char __b19, char __b18, char __b17, char __b16,
   3858                 char __b15, char __b14, char __b13, char __b12,
   3859                 char __b11, char __b10, char __b09, char __b08,
   3860                 char __b07, char __b06, char __b05, char __b04,
   3861                 char __b03, char __b02, char __b01, char __b00)
   3862 { /* parameters arrive high-to-low; the initializer lists them low-to-high (__b00 = bits [7:0]) */
   3863   return (__m256i)(__v32qi){
   3864     __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
   3865     __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
   3866     __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
   3867     __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
   3868   };
   3869 }
   3870 
   3871 /// \brief Constructs a 256-bit integer vector initialized with the specified
   3872 ///    64-bit integral values.
   3873 ///
   3874 /// \headerfile <x86intrin.h>
   3875 ///
   3876 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
   3877 ///   instruction.
   3878 ///
   3879 /// \param __a
   3880 ///    A 64-bit integral value used to initialize bits [255:192] of the result.
   3881 /// \param __b
   3882 ///    A 64-bit integral value used to initialize bits [191:128] of the result.
   3883 /// \param __c
   3884 ///    A 64-bit integral value used to initialize bits [127:64] of the result.
   3885 /// \param __d
   3886 ///    A 64-bit integral value used to initialize bits [63:0] of the result.
   3887 /// \returns An initialized 256-bit integer vector.
   3888 static __inline __m256i __DEFAULT_FN_ATTRS
   3889 _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
   3890 {
   3891   return (__m256i)(__v4di){ __d, __c, __b, __a }; /* lowest element first: __d = bits [63:0], __a = bits [255:192] */
   3892 }
   3893 
   3894 /* Create vectors with elements in reverse order */
   3895 /// \brief Constructs a 256-bit floating-point vector of [4 x double],
   3896 ///    initialized in reverse order with the specified double-precision
   3897 ///    floating-point values.
   3898 ///
   3899 /// \headerfile <x86intrin.h>
   3900 ///
   3901 /// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
   3902 ///   instruction.
   3903 ///
   3904 /// \param __a
   3905 ///    A double-precision floating-point value used to initialize bits [63:0]
   3906 ///    of the result.
   3907 /// \param __b
   3908 ///    A double-precision floating-point value used to initialize bits [127:64]
   3909 ///    of the result.
   3910 /// \param __c
   3911 ///    A double-precision floating-point value used to initialize bits [191:128]
   3912 ///    of the result.
   3913 /// \param __d
   3914 ///    A double-precision floating-point value used to initialize bits [255:192]
   3915 ///    of the result.
   3916 /// \returns An initialized 256-bit floating-point vector of [4 x double].
   3917 static __inline __m256d __DEFAULT_FN_ATTRS
   3918 _mm256_setr_pd(double __a, double __b, double __c, double __d)
   3919 {
   3920   return (__m256d){ __a, __b, __c, __d }; /* arguments used in the order given: __a is the lowest element */
   3921 }
   3922 
   3923 /// \brief Constructs a 256-bit floating-point vector of [8 x float],
   3924 ///    initialized in reverse order with the specified single-precision
   3925 ///    floating-point values.
   3926 ///
   3927 /// \headerfile <x86intrin.h>
   3928 ///
   3929 /// This intrinsic is a utility function and does not correspond to a specific
   3930 ///   instruction.
   3931 ///
   3932 /// \param __a
   3933 ///    A single-precision floating-point value used to initialize bits [31:0]
   3934 ///    of the result.
   3935 /// \param __b
   3936 ///    A single-precision floating-point value used to initialize bits [63:32]
   3937 ///    of the result.
   3938 /// \param __c
   3939 ///    A single-precision floating-point value used to initialize bits [95:64]
   3940 ///    of the result.
   3941 /// \param __d
   3942 ///    A single-precision floating-point value used to initialize bits [127:96]
   3943 ///    of the result.
   3944 /// \param __e
   3945 ///    A single-precision floating-point value used to initialize bits [159:128]
   3946 ///    of the result.
   3947 /// \param __f
   3948 ///    A single-precision floating-point value used to initialize bits [191:160]
   3949 ///    of the result.
   3950 /// \param __g
   3951 ///    A single-precision floating-point value used to initialize bits [223:192]
   3952 ///    of the result.
   3953 /// \param __h
   3954 ///    A single-precision floating-point value used to initialize bits [255:224]
   3955 ///    of the result.
   3956 /// \returns An initialized 256-bit floating-point vector of [8 x float].
   3957 static __inline __m256 __DEFAULT_FN_ATTRS
   3958 _mm256_setr_ps(float __a, float __b, float __c, float __d,
   3959                float __e, float __f, float __g, float __h)
   3960 { /* arguments used in the order given: __a is the lowest element, __h the highest */
   3961   return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
   3962 }
   3963 
   3964 /// \brief Constructs a 256-bit integer vector, initialized in reverse order
   3965 ///    with the specified 32-bit integral values.
   3966 ///
   3967 /// \headerfile <x86intrin.h>
   3968 ///
   3969 /// This intrinsic is a utility function and does not correspond to a specific
   3970 ///   instruction.
   3971 ///
   3972 /// \param __i0
   3973 ///    A 32-bit integral value used to initialize bits [31:0] of the result.
   3974 /// \param __i1
   3975 ///    A 32-bit integral value used to initialize bits [63:32] of the result.
   3976 /// \param __i2
   3977 ///    A 32-bit integral value used to initialize bits [95:64] of the result.
   3978 /// \param __i3
   3979 ///    A 32-bit integral value used to initialize bits [127:96] of the result.
   3980 /// \param __i4
   3981 ///    A 32-bit integral value used to initialize bits [159:128] of the result.
   3982 /// \param __i5
   3983 ///    A 32-bit integral value used to initialize bits [191:160] of the result.
   3984 /// \param __i6
   3985 ///    A 32-bit integral value used to initialize bits [223:192] of the result.
   3986 /// \param __i7
   3987 ///    A 32-bit integral value used to initialize bits [255:224] of the result.
   3988 /// \returns An initialized 256-bit integer vector.
   3989 static __inline __m256i __DEFAULT_FN_ATTRS
   3990 _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
   3991                   int __i4, int __i5, int __i6, int __i7)
   3992 { /* built as __v8si in the order given (__i0 = bits [31:0]), then cast to the generic __m256i */
   3993   return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
   3994 }
   3995 
   3996 /// \brief Constructs a 256-bit integer vector, initialized in reverse order
   3997 ///    with the specified 16-bit integral values.
   3998 ///
   3999 /// \headerfile <x86intrin.h>
   4000 ///
   4001 /// This intrinsic is a utility function and does not correspond to a specific
   4002 ///   instruction.
   4003 ///
   4004 /// \param __w15
   4005 ///    A 16-bit integral value used to initialize bits [15:0] of the result.
   4006 /// \param __w14
   4007 ///    A 16-bit integral value used to initialize bits [31:16] of the result.
   4008 /// \param __w13
   4009 ///    A 16-bit integral value used to initialize bits [47:32] of the result.
   4010 /// \param __w12
   4011 ///    A 16-bit integral value used to initialize bits [63:48] of the result.
   4012 /// \param __w11
   4013 ///    A 16-bit integral value used to initialize bits [79:64] of the result.
   4014 /// \param __w10
   4015 ///    A 16-bit integral value used to initialize bits [95:80] of the result.
   4016 /// \param __w09
   4017 ///    A 16-bit integral value used to initialize bits [111:96] of the result.
   4018 /// \param __w08
   4019 ///    A 16-bit integral value used to initialize bits [127:112] of the result.
   4020 /// \param __w07
   4021 ///    A 16-bit integral value used to initialize bits [143:128] of the result.
   4022 /// \param __w06
   4023 ///    A 16-bit integral value used to initialize bits [159:144] of the result.
   4024 /// \param __w05
   4025 ///    A 16-bit integral value used to initialize bits [175:160] of the result.
   4026 /// \param __w04
   4027 ///    A 16-bit integral value used to initialize bits [191:176] of the result.
   4028 /// \param __w03
   4029 ///    A 16-bit integral value used to initialize bits [207:192] of the result.
   4030 /// \param __w02
   4031 ///    A 16-bit integral value used to initialize bits [223:208] of the result.
   4032 /// \param __w01
   4033 ///    A 16-bit integral value used to initialize bits [239:224] of the result.
   4034 /// \param __w00
   4035 ///    A 16-bit integral value used to initialize bits [255:240] of the result.
   4036 /// \returns An initialized 256-bit integer vector.
   4037 static __inline __m256i __DEFAULT_FN_ATTRS
   4038 _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
   4039        short __w11, short __w10, short __w09, short __w08,
   4040        short __w07, short __w06, short __w05, short __w04,
   4041        short __w03, short __w02, short __w01, short __w00)
   4042 { /* Names count down while elements count up: __w15 is element 0, __w00 is element 15. */
   4043   return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
   4044     __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
   4045 }
   4046 
   4047 /// \brief Constructs a 256-bit integer vector, initialized in reverse order
   4048 ///    (lowest element first) with the specified 8-bit integral values.
   4049 ///
   4050 /// \headerfile <x86intrin.h>
   4051 ///
   4052 /// This intrinsic is a utility function and does not correspond to a specific
   4053 ///   instruction.
   4054 ///
   4055 /// \param __b31
   4056 ///    An 8-bit integral value used to initialize bits [7:0] of the result.
   4057 /// \param __b30
   4058 ///    An 8-bit integral value used to initialize bits [15:8] of the result.
   4059 /// \param __b29
   4060 ///    An 8-bit integral value used to initialize bits [23:16] of the result.
   4061 /// \param __b28
   4062 ///    An 8-bit integral value used to initialize bits [31:24] of the result.
   4063 /// \param __b27
   4064 ///    An 8-bit integral value used to initialize bits [39:32] of the result.
   4065 /// \param __b26
   4066 ///    An 8-bit integral value used to initialize bits [47:40] of the result.
   4067 /// \param __b25
   4068 ///    An 8-bit integral value used to initialize bits [55:48] of the result.
   4069 /// \param __b24
   4070 ///    An 8-bit integral value used to initialize bits [63:56] of the result.
   4071 /// \param __b23
   4072 ///    An 8-bit integral value used to initialize bits [71:64] of the result.
   4073 /// \param __b22
   4074 ///    An 8-bit integral value used to initialize bits [79:72] of the result.
   4075 /// \param __b21
   4076 ///    An 8-bit integral value used to initialize bits [87:80] of the result.
   4077 /// \param __b20
   4078 ///    An 8-bit integral value used to initialize bits [95:88] of the result.
   4079 /// \param __b19
   4080 ///    An 8-bit integral value used to initialize bits [103:96] of the result.
   4081 /// \param __b18
   4082 ///    An 8-bit integral value used to initialize bits [111:104] of the result.
   4083 /// \param __b17
   4084 ///    An 8-bit integral value used to initialize bits [119:112] of the result.
   4085 /// \param __b16
   4086 ///    An 8-bit integral value used to initialize bits [127:120] of the result.
   4087 /// \param __b15
   4088 ///    An 8-bit integral value used to initialize bits [135:128] of the result.
   4089 /// \param __b14
   4090 ///    An 8-bit integral value used to initialize bits [143:136] of the result.
   4091 /// \param __b13
   4092 ///    An 8-bit integral value used to initialize bits [151:144] of the result.
   4093 /// \param __b12
   4094 ///    An 8-bit integral value used to initialize bits [159:152] of the result.
   4095 /// \param __b11
   4096 ///    An 8-bit integral value used to initialize bits [167:160] of the result.
   4097 /// \param __b10
   4098 ///    An 8-bit integral value used to initialize bits [175:168] of the result.
   4099 /// \param __b09
   4100 ///    An 8-bit integral value used to initialize bits [183:176] of the result.
   4101 /// \param __b08
   4102 ///    An 8-bit integral value used to initialize bits [191:184] of the result.
   4103 /// \param __b07
   4104 ///    An 8-bit integral value used to initialize bits [199:192] of the result.
   4105 /// \param __b06
   4106 ///    An 8-bit integral value used to initialize bits [207:200] of the result.
   4107 /// \param __b05
   4108 ///    An 8-bit integral value used to initialize bits [215:208] of the result.
   4109 /// \param __b04
   4110 ///    An 8-bit integral value used to initialize bits [223:216] of the result.
   4111 /// \param __b03
   4112 ///    An 8-bit integral value used to initialize bits [231:224] of the result.
   4113 /// \param __b02
   4114 ///    An 8-bit integral value used to initialize bits [239:232] of the result.
   4115 /// \param __b01
   4116 ///    An 8-bit integral value used to initialize bits [247:240] of the result.
   4117 /// \param __b00
   4118 ///    An 8-bit integral value used to initialize bits [255:248] of the result.
   4119 /// \returns An initialized 256-bit integer vector.
   4120 static __inline __m256i __DEFAULT_FN_ATTRS
   4121 _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
   4122                  char __b27, char __b26, char __b25, char __b24,
   4123                  char __b23, char __b22, char __b21, char __b20,
   4124                  char __b19, char __b18, char __b17, char __b16,
   4125                  char __b15, char __b14, char __b13, char __b12,
   4126                  char __b11, char __b10, char __b09, char __b08,
   4127                  char __b07, char __b06, char __b05, char __b04,
   4128                  char __b03, char __b02, char __b01, char __b00)
   4129 { /* Names count down while elements count up: __b31 is element 0, __b00 is element 31. */
   4130   return (__m256i)(__v32qi){
   4131     __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
   4132     __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
   4133     __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
   4134     __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
   4135 }
   4136 
   4137 /// \brief Constructs a 256-bit integer vector, initialized in reverse order
   4138 ///    (lowest element first) with the specified 64-bit integral values.
   4139 ///
   4140 /// \headerfile <x86intrin.h>
   4141 ///
   4142 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
   4143 ///   instruction.
   4144 ///
   4145 /// \param __a
   4146 ///    A 64-bit integral value used to initialize bits [63:0] of the result.
   4147 /// \param __b
   4148 ///    A 64-bit integral value used to initialize bits [127:64] of the result.
   4149 /// \param __c
   4150 ///    A 64-bit integral value used to initialize bits [191:128] of the result.
   4151 /// \param __d
   4152 ///    A 64-bit integral value used to initialize bits [255:192] of the result.
   4153 /// \returns An initialized 256-bit integer vector.
   4154 static __inline __m256i __DEFAULT_FN_ATTRS
   4155 _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
   4156 {
   4157   return (__m256i)(__v4di){ __a, __b, __c, __d }; /* __a is element 0, __d is element 3 */
   4158 }
   4159 
   4160 /* Create vectors with repeated elements */
   4161 /// \brief Constructs a 256-bit floating-point vector of [4 x double], with each
   4162 ///    of the four double-precision floating-point vector elements set to the
   4163 ///    specified double-precision floating-point value.
   4164 ///
   4165 /// \headerfile <x86intrin.h>
   4166 ///
   4167 /// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
   4168 ///
   4169 /// \param __w
   4170 ///    A double-precision floating-point value used to initialize each vector
   4171 ///    element of the result.
   4172 /// \returns An initialized 256-bit floating-point vector of [4 x double].
   4173 static __inline __m256d __DEFAULT_FN_ATTRS
   4174 _mm256_set1_pd(double __w)
   4175 {
   4176   return (__m256d){ __w, __w, __w, __w }; /* all four elements are copies of __w */
   4177 }
   4178 
   4179 /// \brief Constructs a 256-bit floating-point vector of [8 x float], with each
   4180 ///    of the eight single-precision floating-point vector elements set to the
   4181 ///    specified single-precision floating-point value.
   4182 ///
   4183 /// \headerfile <x86intrin.h>
   4184 ///
   4185 /// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
   4186 ///   instruction.
   4187 ///
   4188 /// \param __w
   4189 ///    A single-precision floating-point value used to initialize each vector
   4190 ///    element of the result.
   4191 /// \returns An initialized 256-bit floating-point vector of [8 x float].
   4192 static __inline __m256 __DEFAULT_FN_ATTRS
   4193 _mm256_set1_ps(float __w)
   4194 {
   4195   return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w }; /* all eight elements are copies of __w */
   4196 }
   4197 
   4198 /// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the
   4199 ///    32-bit integral vector elements set to the specified 32-bit integral
   4200 ///    value.
   4201 ///
   4202 /// \headerfile <x86intrin.h>
   4203 ///
   4204 /// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
   4205 ///   instruction.
   4206 ///
   4207 /// \param __i
   4208 ///    A 32-bit integral value used to initialize each vector element of the
   4209 ///    result.
   4210 /// \returns An initialized 256-bit integer vector of [8 x i32].
   4211 static __inline __m256i __DEFAULT_FN_ATTRS
   4212 _mm256_set1_epi32(int __i)
   4213 { /* Eight copies of __i are built as a __v8si vector, which is then cast to __m256i. */
   4214   return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
   4215 }
   4216 
   4217 /// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the
   4218 ///    16-bit integral vector elements set to the specified 16-bit integral
   4219 ///    value.
   4220 ///
   4221 /// \headerfile <x86intrin.h>
   4222 ///
   4223 /// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
   4224 ///
   4225 /// \param __w
   4226 ///    A 16-bit integral value used to initialize each vector element of the
   4227 ///    result.
   4228 /// \returns An initialized 256-bit integer vector of [16 x i16].
   4229 static __inline __m256i __DEFAULT_FN_ATTRS
   4230 _mm256_set1_epi16(short __w)
   4231 { /* Sixteen copies of __w are built as a __v16hi vector, which is then cast to __m256i. */
   4232   return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
   4233     __w, __w, __w, __w, __w, __w };
   4234 }
   4235 
   4236 /// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the
   4237 ///    8-bit integral vector elements set to the specified 8-bit integral value.
   4238 ///
   4239 /// \headerfile <x86intrin.h>
   4240 ///
   4241 /// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
   4242 ///
   4243 /// \param __b
   4244 ///    An 8-bit integral value used to initialize each vector element of the
   4245 ///    result.
   4246 /// \returns An initialized 256-bit integer vector of [32 x i8].
   4247 static __inline __m256i __DEFAULT_FN_ATTRS
   4248 _mm256_set1_epi8(char __b)
   4249 { /* Thirty-two copies of __b are built as a __v32qi vector, which is then cast to __m256i. */
   4250   return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
   4251     __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
   4252     __b, __b, __b, __b, __b, __b, __b };
   4253 }
   4254 
   4255 /// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the
   4256 ///    64-bit integral vector elements set to the specified 64-bit integral
   4257 ///    value.
   4258 ///
   4259 /// \headerfile <x86intrin.h>
   4260 ///
   4261 /// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
   4262 ///
   4263 /// \param __q
   4264 ///    A 64-bit integral value used to initialize each vector element of the
   4265 ///    result.
   4266 /// \returns An initialized 256-bit integer vector of [4 x i64].
   4267 static __inline __m256i __DEFAULT_FN_ATTRS
   4268 _mm256_set1_epi64x(long long __q)
   4269 {
   4270   return (__m256i)(__v4di){ __q, __q, __q, __q }; /* all four elements are copies of __q */
   4271 }
   4272 
   4273 /* Create zeroed vectors */
   4274 /// \brief Constructs a 256-bit floating-point vector of [4 x double] with all
   4275 ///    vector elements initialized to zero.
   4276 ///
   4277 /// \headerfile <x86intrin.h>
   4278 ///
   4279 /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
   4280 ///
   4281 /// \returns A 256-bit vector of [4 x double] with all elements set to zero.
   4282 static __inline __m256d __DEFAULT_FN_ATTRS
   4283 _mm256_setzero_pd(void)
   4284 {
   4285   return (__m256d){ 0, 0, 0, 0 }; /* each integer 0 converts to 0.0 in its double element */
   4286 }
   4287 
   4288 /// \brief Constructs a 256-bit floating-point vector of [8 x float] with all
   4289 ///    vector elements initialized to zero.
   4290 ///
   4291 /// \headerfile <x86intrin.h>
   4292 ///
   4293 /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
   4294 ///
   4295 /// \returns A 256-bit vector of [8 x float] with all elements set to zero.
   4296 static __inline __m256 __DEFAULT_FN_ATTRS
   4297 _mm256_setzero_ps(void)
   4298 {
   4299   return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 }; /* each integer 0 converts to 0.0f in its float element */
   4300 }
   4301 
   4302 /// \brief Constructs a 256-bit integer vector initialized to zero.
   4303 ///
   4304 /// \headerfile <x86intrin.h>
   4305 ///
   4306 /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
   4307 ///
   4308 /// \returns A 256-bit integer vector initialized to zero.
   4309 static __inline __m256i __DEFAULT_FN_ATTRS
   4310 _mm256_setzero_si256(void)
   4311 {
   4312   return (__m256i){ 0LL, 0LL, 0LL, 0LL }; /* four 64-bit zeros cover all 256 bits */
   4313 }
   4314 
   4315 /* Cast between vector types */
   4316 /// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
   4317 ///    floating-point vector of [8 x float].
   4318 ///
   4319 /// \headerfile <x86intrin.h>
   4320 ///
   4321 /// This intrinsic has no corresponding instruction.
   4322 ///
   4323 /// \param __a
   4324 ///    A 256-bit floating-point vector of [4 x double].
   4325 /// \returns A 256-bit floating-point vector of [8 x float] containing the same
   4326 ///    bitwise pattern as the parameter.
   4327 static __inline __m256 __DEFAULT_FN_ATTRS
   4328 _mm256_castpd_ps(__m256d __a)
   4329 {
   4330   return (__m256)__a; /* same-size vector cast: bits are reinterpreted, not converted */
   4331 }
   4332 
   4333 /// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
   4334 ///    integer vector.
   4335 ///
   4336 /// \headerfile <x86intrin.h>
   4337 ///
   4338 /// This intrinsic has no corresponding instruction.
   4339 ///
   4340 /// \param __a
   4341 ///    A 256-bit floating-point vector of [4 x double].
   4342 /// \returns A 256-bit integer vector containing the same bitwise pattern as the
   4343 ///    parameter.
   4344 static __inline __m256i __DEFAULT_FN_ATTRS
   4345 _mm256_castpd_si256(__m256d __a)
   4346 {
   4347   return (__m256i)__a; /* same-size vector cast: bits are reinterpreted, not converted */
   4348 }
   4349 
   4350 /// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
   4351 ///    floating-point vector of [4 x double].
   4352 ///
   4353 /// \headerfile <x86intrin.h>
   4354 ///
   4355 /// This intrinsic has no corresponding instruction.
   4356 ///
   4357 /// \param __a
   4358 ///    A 256-bit floating-point vector of [8 x float].
   4359 /// \returns A 256-bit floating-point vector of [4 x double] containing the same
   4360 ///    bitwise pattern as the parameter.
   4361 static __inline __m256d __DEFAULT_FN_ATTRS
   4362 _mm256_castps_pd(__m256 __a)
   4363 {
   4364   return (__m256d)__a; /* same-size vector cast: bits are reinterpreted, not converted */
   4365 }
   4366 
   4367 /// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
   4368 ///    integer vector.
   4369 ///
   4370 /// \headerfile <x86intrin.h>
   4371 ///
   4372 /// This intrinsic has no corresponding instruction.
   4373 ///
   4374 /// \param __a
   4375 ///    A 256-bit floating-point vector of [8 x float].
   4376 /// \returns A 256-bit integer vector containing the same bitwise pattern as the
   4377 ///    parameter.
   4378 static __inline __m256i __DEFAULT_FN_ATTRS
   4379 _mm256_castps_si256(__m256 __a)
   4380 {
   4381   return (__m256i)__a; /* same-size vector cast: bits are reinterpreted, not converted */
   4382 }
   4383 
   4384 /// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
   4385 ///    of [8 x float].
   4386 ///
   4387 /// \headerfile <x86intrin.h>
   4388 ///
   4389 /// This intrinsic has no corresponding instruction.
   4390 ///
   4391 /// \param __a
   4392 ///    A 256-bit integer vector.
   4393 /// \returns A 256-bit floating-point vector of [8 x float] containing the same
   4394 ///    bitwise pattern as the parameter.
   4395 static __inline __m256 __DEFAULT_FN_ATTRS
   4396 _mm256_castsi256_ps(__m256i __a)
   4397 {
   4398   return (__m256)__a; /* same-size vector cast: bits are reinterpreted, not converted */
   4399 }
   4400 
   4401 /// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
   4402 ///    of [4 x double].
   4403 ///
   4404 /// \headerfile <x86intrin.h>
   4405 ///
   4406 /// This intrinsic has no corresponding instruction.
   4407 ///
   4408 /// \param __a
   4409 ///    A 256-bit integer vector.
   4410 /// \returns A 256-bit floating-point vector of [4 x double] containing the same
   4411 ///    bitwise pattern as the parameter.
   4412 static __inline __m256d __DEFAULT_FN_ATTRS
   4413 _mm256_castsi256_pd(__m256i __a)
   4414 {
   4415   return (__m256d)__a; /* same-size vector cast: bits are reinterpreted, not converted */
   4416 }
   4417 
   4418 /// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
   4419 ///    [4 x double] as a 128-bit floating-point vector of [2 x double].
   4420 ///
   4421 /// \headerfile <x86intrin.h>
   4422 ///
   4423 /// This intrinsic has no corresponding instruction.
   4424 ///
   4425 /// \param __a
   4426 ///    A 256-bit floating-point vector of [4 x double].
   4427 /// \returns A 128-bit floating-point vector of [2 x double] containing the
   4428 ///    lower 128 bits of the parameter.
   4429 static __inline __m128d __DEFAULT_FN_ATTRS
   4430 _mm256_castpd256_pd128(__m256d __a)
   4431 { /* Indices 0 and 1 pick the two lowest elements of the first operand; the second operand is never selected. */
   4432   return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
   4433 }
   4434 
   4435 /// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
   4436 ///    [8 x float] as a 128-bit floating-point vector of [4 x float].
   4437 ///
   4438 /// \headerfile <x86intrin.h>
   4439 ///
   4440 /// This intrinsic has no corresponding instruction.
   4441 ///
   4442 /// \param __a
   4443 ///    A 256-bit floating-point vector of [8 x float].
   4444 /// \returns A 128-bit floating-point vector of [4 x float] containing the
   4445 ///    lower 128 bits of the parameter.
   4446 static __inline __m128 __DEFAULT_FN_ATTRS
   4447 _mm256_castps256_ps128(__m256 __a)
   4448 { /* Indices 0-3 pick the four lowest elements of the first operand; the second operand is never selected. */
   4449   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
   4450 }
   4451 
   4452 /// \brief Truncates a 256-bit integer vector into a 128-bit integer vector.
   4453 ///
   4454 /// \headerfile <x86intrin.h>
   4455 ///
   4456 /// This intrinsic has no corresponding instruction.
   4457 ///
   4458 /// \param __a
   4459 ///    A 256-bit integer vector.
   4460 /// \returns A 128-bit integer vector containing the lower 128 bits of the
   4461 ///    parameter.
   4462 static __inline __m128i __DEFAULT_FN_ATTRS
   4463 _mm256_castsi256_si128(__m256i __a)
   4464 { /* __a is viewed as four 64-bit elements; indices 0 and 1 keep the lowest two (bits [127:0]). */
   4465   return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
   4466 }
   4467 
   4468 /// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
   4469 ///    128-bit floating-point vector of [2 x double]. The lower 128 bits
   4470 ///    contain the value of the source vector. The contents of the upper 128
   4471 ///    bits are undefined.
   4472 ///
   4473 /// \headerfile <x86intrin.h>
   4474 ///
   4475 /// This intrinsic has no corresponding instruction.
   4476 ///
   4477 /// \param __a
   4478 ///    A 128-bit vector of [2 x double].
   4479 /// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
   4480 ///    contain the value of the parameter. The contents of the upper 128 bits
   4481 ///    are undefined.
   4482 static __inline __m256d __DEFAULT_FN_ATTRS
   4483 _mm256_castpd128_pd256(__m128d __a)
   4484 { /* Indices 0 and 1 copy __a; each -1 index marks a don't-care element (the undefined upper half). */
   4485   return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
   4486 }
   4487 
   4488 /// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
   4489 ///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
   4490 ///    the value of the source vector. The contents of the upper 128 bits are
   4491 ///    undefined.
   4492 ///
   4493 /// \headerfile <x86intrin.h>
   4494 ///
   4495 /// This intrinsic has no corresponding instruction.
   4496 ///
   4497 /// \param __a
   4498 ///    A 128-bit vector of [4 x float].
   4499 /// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
   4500 ///    contain the value of the parameter. The contents of the upper 128 bits
   4501 ///    are undefined.
   4502 static __inline __m256 __DEFAULT_FN_ATTRS
   4503 _mm256_castps128_ps256(__m128 __a)
   4504 { /* Indices 0-3 copy __a; each -1 index marks a don't-care element (the undefined upper half). */
   4505   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
   4506 }
   4507 
   4508 /// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
   4509 ///    The lower 128 bits contain the value of the source vector. The contents
   4510 ///    of the upper 128 bits are undefined.
   4511 ///
   4512 /// \headerfile <x86intrin.h>
   4513 ///
   4514 /// This intrinsic has no corresponding instruction.
   4515 ///
   4516 /// \param __a
   4517 ///    A 128-bit integer vector.
   4518 /// \returns A 256-bit integer vector. The lower 128 bits contain the value of
   4519 ///    the parameter. The contents of the upper 128 bits are undefined.
   4520 static __inline __m256i __DEFAULT_FN_ATTRS
   4521 _mm256_castsi128_si256(__m128i __a)
   4522 { /* Indices 0 and 1 copy __a; each -1 index marks a don't-care element (the undefined upper half). */
   4523   return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
   4524 }
   4525 
   4526 /*
   4527    Vector insert.
   4528    We use macros rather than inlines because we only want to accept
   4529    invocations where the immediate M is a constant expression.
   4530 */
   4531 /// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating
   4532 ///    a 256-bit vector of [8 x float] given in the first parameter, and then
   4533 ///    replacing either the upper or the lower 128 bits with the contents of a
   4534 ///    128-bit vector of [4 x float] in the second parameter. The immediate
   4535 ///    integer parameter selects between the upper and the lower 128 bits.
   4536 ///
   4537 /// \headerfile <x86intrin.h>
   4538 ///
   4539 /// \code
   4540 /// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
   4541 /// \endcode
   4542 ///
   4543 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   4544 ///
   4545 /// \param V1
   4546 ///    A 256-bit vector of [8 x float]. This vector is copied to the result
   4547 ///    first, and then either the upper or the lower 128 bits of the result will
   4548 ///    be replaced by the contents of \a V2.
   4549 /// \param V2
   4550 ///    A 128-bit vector of [4 x float]. The contents of this parameter are
   4551 ///    written to either the upper or the lower 128 bits of the result depending
   4552 ///    on the value of parameter \a M.
   4553 /// \param M
   4554 ///    An immediate integer. The least significant bit determines how the values
   4555 ///    from the two parameters are combined: \n
   4556 ///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
   4557 ///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
   4558 ///    result. \n
   4559 ///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
   4560 ///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
   4561 ///    result.
   4562 /// \returns A 256-bit vector of [8 x float] containing the combined values.
   4563 #define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
   4564   (__m256)__builtin_shufflevector( \
   4565     (__v8sf)(__m256)(V1), /* first operand: shuffle indices 0-7 */ \
   4566     (__v8sf)_mm256_castps128_ps256((__m128)(V2)), /* V2 widened: its elements are indices 8-11 */ \
   4567     (((M) & 1) ?  0 :  8), \
   4568     (((M) & 1) ?  1 :  9), \
   4569     (((M) & 1) ?  2 : 10), \
   4570     (((M) & 1) ?  3 : 11), \
   4571     (((M) & 1) ?  8 :  4), \
   4572     (((M) & 1) ?  9 :  5), \
   4573     (((M) & 1) ? 10 :  6), \
   4574     (((M) & 1) ? 11 :  7) );})
   4575 
   4576 /// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating
   4577 ///    a 256-bit vector of [4 x double] given in the first parameter, and then
   4578 ///    replacing either the upper or the lower 128 bits with the contents of a
   4579 ///    128-bit vector of [2 x double] in the second parameter. The immediate
   4580 ///    integer parameter selects between the upper and the lower 128 bits.
   4581 ///
   4582 /// \headerfile <x86intrin.h>
   4583 ///
   4584 /// \code
   4585 /// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
   4586 /// \endcode
   4587 ///
   4588 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   4589 ///
   4590 /// \param V1
   4591 ///    A 256-bit vector of [4 x double]. This vector is copied to the result
   4592 ///    first, and then either the upper or the lower 128 bits of the result will
   4593 ///    be replaced by the contents of \a V2.
   4594 /// \param V2
   4595 ///    A 128-bit vector of [2 x double]. The contents of this parameter are
   4596 ///    written to either the upper or the lower 128 bits of the result depending
   4597 ///    on the value of parameter \a M.
   4598 /// \param M
   4599 ///    An immediate integer. The least significant bit determines how the values
   4600 ///    from the two parameters are combined: \n
   4601 ///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
   4602 ///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
   4603 ///    result. \n
   4604 ///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
   4605 ///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
   4606 ///    result.
   4607 /// \returns A 256-bit vector of [4 x double] containing the combined values.
   4608 #define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
   4609   (__m256d)__builtin_shufflevector( \
   4610     (__v4df)(__m256d)(V1), /* first operand: shuffle indices 0-3 */ \
   4611     (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), /* V2 widened: its elements are indices 4-5 */ \
   4612     (((M) & 1) ? 0 : 4), \
   4613     (((M) & 1) ? 1 : 5), \
   4614     (((M) & 1) ? 4 : 2), \
   4615     (((M) & 1) ? 5 : 3) );})
   4616 
   4617 /// \brief Constructs a new 256-bit integer vector by first duplicating a
   4618 ///    256-bit integer vector given in the first parameter, and then replacing
   4619 ///    either the upper or the lower 128 bits with the contents of a 128-bit
   4620 ///    integer vector in the second parameter. The immediate integer parameter
   4621 ///    selects between the upper and the lower 128 bits.
   4622 ///
   4623 /// \headerfile <x86intrin.h>
   4624 ///
   4625 /// \code
   4626 /// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
   4627 /// \endcode
   4628 ///
   4629 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   4630 ///
   4631 /// \param V1
   4632 ///    A 256-bit integer vector. This vector is copied to the result first, and
   4633 ///    then either the upper or the lower 128 bits of the result will be
   4634 ///    replaced by the contents of \a V2.
   4635 /// \param V2
   4636 ///    A 128-bit integer vector. The contents of this parameter are written to
   4637 ///    either the upper or the lower 128 bits of the result depending on the
   4638 ///    value of parameter \a M.
   4639 /// \param M
   4640 ///    An immediate integer. The least significant bit determines how the values
   4641 ///    from the two parameters are combined: \n
   4642 ///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
   4643 ///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
   4644 ///    result. \n
   4645 ///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
   4646 ///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
   4647 ///    result.
   4648 /// \returns A 256-bit integer vector containing the combined values.
   4649 #define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
   4650   (__m256i)__builtin_shufflevector( \
   4651     (__v4di)(__m256i)(V1), /* first operand: shuffle indices 0-3 */ \
   4652     (__v4di)_mm256_castsi128_si256((__m128i)(V2)), /* V2 widened: its elements are indices 4-5 */ \
   4653     (((M) & 1) ? 0 : 4), \
   4654     (((M) & 1) ? 1 : 5), \
   4655     (((M) & 1) ? 4 : 2), \
   4656     (((M) & 1) ? 5 : 3) );})
   4657 
   4658 /*
   4659    Vector extract.
   4660    We use macros rather than inlines because we only want to accept
   4661    invocations where the immediate M is a constant expression.
   4662 */
   4663 /// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
   4664 ///    of [8 x float], as determined by the immediate integer parameter, and
   4665 ///    returns the extracted bits as a 128-bit vector of [4 x float].
   4666 ///
   4667 /// \headerfile <x86intrin.h>
   4668 ///
   4669 /// \code
   4670 /// __m128 _mm256_extractf128_ps(__m256 V, const int M);
   4671 /// \endcode
   4672 ///
   4673 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
   4674 ///
   4675 /// \param V
   4676 ///    A 256-bit vector of [8 x float].
   4677 /// \param M
   4678 ///    An immediate integer. The least significant bit determines which bits are
   4679 ///    extracted from the first parameter: \n
   4680 ///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
   4681 ///    result. \n
   4682 ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
   4683 /// \returns A 128-bit vector of [4 x float] containing the extracted bits.
   4684 #define _mm256_extractf128_ps(V, M) __extension__ ({ \
   4685   (__m128)__builtin_shufflevector( \
   4686     (__v8sf)(__m256)(V), \
   4687     (__v8sf)(_mm256_undefined_ps()), /* filler operand: every index below is < 8, so it is never selected */ \
   4688     (((M) & 1) ? 4 : 0), \
   4689     (((M) & 1) ? 5 : 1), \
   4690     (((M) & 1) ? 6 : 2), \
   4691     (((M) & 1) ? 7 : 3) );})
   4692 
   4693 /// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
   4694 ///    of [4 x double], as determined by the immediate integer parameter, and
   4695 ///    returns the extracted bits as a 128-bit vector of [2 x double].
   4696 ///
   4697 /// \headerfile <x86intrin.h>
   4698 ///
   4699 /// \code
   4700 /// __m128d _mm256_extractf128_pd(__m256d V, const int M);
   4701 /// \endcode
   4702 ///
   4703 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
   4704 ///
   4705 /// \param V
   4706 ///    A 256-bit vector of [4 x double].
   4707 /// \param M
   4708 ///    An immediate integer. The least significant bit determines which bits are
   4709 ///    extracted from the first parameter: \n
   4710 ///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
   4711 ///    result. \n
   4712 ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
   4713 /// \returns A 128-bit vector of [2 x double] containing the extracted bits.
   4714 #define _mm256_extractf128_pd(V, M) __extension__ ({ \
   4715   (__m128d)__builtin_shufflevector( \
   4716     (__v4df)(__m256d)(V), \
   4717     (__v4df)(_mm256_undefined_pd()), \
   4718     (((M) & 1) ? 2 : 0), \
   4719     (((M) & 1) ? 3 : 1) );})
   4720 
   4721 /// \brief Extracts either the upper or the lower 128 bits from a 256-bit
   4722 ///    integer vector, as determined by the immediate integer parameter, and
   4723 ///    returns the extracted bits as a 128-bit integer vector.
   4724 ///
   4725 /// \headerfile <x86intrin.h>
   4726 ///
   4727 /// \code
   4728 /// __m128i _mm256_extractf128_si256(__m256i V, const int M);
   4729 /// \endcode
   4730 ///
   4731 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
   4732 ///
   4733 /// \param V
   4734 ///    A 256-bit integer vector.
   4735 /// \param M
   4736 ///    An immediate integer. The least significant bit determines which bits are
   4737 ///    extracted from the first parameter:  \n
   4738 ///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
   4739 ///    result. \n
   4740 ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
   4741 /// \returns A 128-bit integer vector containing the extracted bits.
   4742 #define _mm256_extractf128_si256(V, M) __extension__ ({ \
   4743   (__m128i)__builtin_shufflevector( \
   4744     (__v4di)(__m256i)(V), \
   4745     (__v4di)(_mm256_undefined_si256()), \
   4746     (((M) & 1) ? 2 : 0), \
   4747     (((M) & 1) ? 3 : 1) );})
   4748 
/* SIMD load operations (unaligned) */
   4750 /// \brief Loads two 128-bit floating-point vectors of [4 x float] from
   4751 ///    unaligned memory locations and constructs a 256-bit floating-point vector
   4752 ///    of [8 x float] by concatenating the two 128-bit vectors.
   4753 ///
   4754 /// \headerfile <x86intrin.h>
   4755 ///
   4756 /// This intrinsic corresponds to load instructions followed by the
   4757 ///   <c> VINSERTF128 </c> instruction.
   4758 ///
   4759 /// \param __addr_hi
   4760 ///    A pointer to a 128-bit memory location containing 4 consecutive
   4761 ///    single-precision floating-point values. These values are to be copied to
   4762 ///    bits[255:128] of the result. The address of the memory location does not
   4763 ///    have to be aligned.
   4764 /// \param __addr_lo
   4765 ///    A pointer to a 128-bit memory location containing 4 consecutive
   4766 ///    single-precision floating-point values. These values are to be copied to
   4767 ///    bits[127:0] of the result. The address of the memory location does not
   4768 ///    have to be aligned.
   4769 /// \returns A 256-bit floating-point vector of [8 x float] containing the
   4770 ///    concatenated result.
   4771 static __inline __m256 __DEFAULT_FN_ATTRS
   4772 _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
   4773 {
   4774   __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
   4775   return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
   4776 }
   4777 
   4778 /// \brief Loads two 128-bit floating-point vectors of [2 x double] from
   4779 ///    unaligned memory locations and constructs a 256-bit floating-point vector
   4780 ///    of [4 x double] by concatenating the two 128-bit vectors.
   4781 ///
   4782 /// \headerfile <x86intrin.h>
   4783 ///
   4784 /// This intrinsic corresponds to load instructions followed by the
   4785 ///   <c> VINSERTF128 </c> instruction.
   4786 ///
   4787 /// \param __addr_hi
   4788 ///    A pointer to a 128-bit memory location containing two consecutive
   4789 ///    double-precision floating-point values. These values are to be copied to
   4790 ///    bits[255:128] of the result. The address of the memory location does not
   4791 ///    have to be aligned.
   4792 /// \param __addr_lo
   4793 ///    A pointer to a 128-bit memory location containing two consecutive
   4794 ///    double-precision floating-point values. These values are to be copied to
   4795 ///    bits[127:0] of the result. The address of the memory location does not
   4796 ///    have to be aligned.
   4797 /// \returns A 256-bit floating-point vector of [4 x double] containing the
   4798 ///    concatenated result.
   4799 static __inline __m256d __DEFAULT_FN_ATTRS
   4800 _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
   4801 {
   4802   __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
   4803   return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
   4804 }
   4805 
   4806 /// \brief Loads two 128-bit integer vectors from unaligned memory locations and
   4807 ///    constructs a 256-bit integer vector by concatenating the two 128-bit
   4808 ///    vectors.
   4809 ///
   4810 /// \headerfile <x86intrin.h>
   4811 ///
   4812 /// This intrinsic corresponds to load instructions followed by the
   4813 ///   <c> VINSERTF128 </c> instruction.
   4814 ///
   4815 /// \param __addr_hi
   4816 ///    A pointer to a 128-bit memory location containing a 128-bit integer
   4817 ///    vector. This vector is to be copied to bits[255:128] of the result. The
   4818 ///    address of the memory location does not have to be aligned.
   4819 /// \param __addr_lo
   4820 ///    A pointer to a 128-bit memory location containing a 128-bit integer
   4821 ///    vector. This vector is to be copied to bits[127:0] of the result. The
   4822 ///    address of the memory location does not have to be aligned.
   4823 /// \returns A 256-bit integer vector containing the concatenated result.
   4824 static __inline __m256i __DEFAULT_FN_ATTRS
   4825 _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
   4826 {
   4827   __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
   4828   return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
   4829 }
   4830 
/* SIMD store operations (unaligned) */
   4832 /// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
   4833 ///    vector of [8 x float] into two different unaligned memory locations.
   4834 ///
   4835 /// \headerfile <x86intrin.h>
   4836 ///
   4837 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
   4838 ///   store instructions.
   4839 ///
   4840 /// \param __addr_hi
   4841 ///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
   4842 ///    copied to this memory location. The address of this memory location does
   4843 ///    not have to be aligned.
   4844 /// \param __addr_lo
   4845 ///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
   4846 ///    copied to this memory location. The address of this memory location does
   4847 ///    not have to be aligned.
   4848 /// \param __a
   4849 ///    A 256-bit floating-point vector of [8 x float].
   4850 static __inline void __DEFAULT_FN_ATTRS
   4851 _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
   4852 {
   4853   __m128 __v128;
   4854 
   4855   __v128 = _mm256_castps256_ps128(__a);
   4856   _mm_storeu_ps(__addr_lo, __v128);
   4857   __v128 = _mm256_extractf128_ps(__a, 1);
   4858   _mm_storeu_ps(__addr_hi, __v128);
   4859 }
   4860 
   4861 /// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
   4862 ///    vector of [4 x double] into two different unaligned memory locations.
   4863 ///
   4864 /// \headerfile <x86intrin.h>
   4865 ///
   4866 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
   4867 ///   store instructions.
   4868 ///
   4869 /// \param __addr_hi
   4870 ///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
   4871 ///    copied to this memory location. The address of this memory location does
   4872 ///    not have to be aligned.
   4873 /// \param __addr_lo
   4874 ///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
   4875 ///    copied to this memory location. The address of this memory location does
   4876 ///    not have to be aligned.
   4877 /// \param __a
   4878 ///    A 256-bit floating-point vector of [4 x double].
   4879 static __inline void __DEFAULT_FN_ATTRS
   4880 _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
   4881 {
   4882   __m128d __v128;
   4883 
   4884   __v128 = _mm256_castpd256_pd128(__a);
   4885   _mm_storeu_pd(__addr_lo, __v128);
   4886   __v128 = _mm256_extractf128_pd(__a, 1);
   4887   _mm_storeu_pd(__addr_hi, __v128);
   4888 }
   4889 
   4890 /// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into
   4891 ///    two different unaligned memory locations.
   4892 ///
   4893 /// \headerfile <x86intrin.h>
   4894 ///
   4895 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
   4896 ///   store instructions.
   4897 ///
   4898 /// \param __addr_hi
   4899 ///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
   4900 ///    copied to this memory location. The address of this memory location does
   4901 ///    not have to be aligned.
   4902 /// \param __addr_lo
   4903 ///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
   4904 ///    copied to this memory location. The address of this memory location does
   4905 ///    not have to be aligned.
   4906 /// \param __a
   4907 ///    A 256-bit integer vector.
   4908 static __inline void __DEFAULT_FN_ATTRS
   4909 _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
   4910 {
   4911   __m128i __v128;
   4912 
   4913   __v128 = _mm256_castsi256_si128(__a);
   4914   _mm_storeu_si128(__addr_lo, __v128);
   4915   __v128 = _mm256_extractf128_si256(__a, 1);
   4916   _mm_storeu_si128(__addr_hi, __v128);
   4917 }
   4918 
   4919 /// \brief Constructs a 256-bit floating-point vector of [8 x float] by
   4920 ///    concatenating two 128-bit floating-point vectors of [4 x float].
   4921 ///
   4922 /// \headerfile <x86intrin.h>
   4923 ///
   4924 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   4925 ///
   4926 /// \param __hi
   4927 ///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
   4928 ///    128 bits of the result.
   4929 /// \param __lo
   4930 ///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
   4931 ///    128 bits of the result.
   4932 /// \returns A 256-bit floating-point vector of [8 x float] containing the
   4933 ///    concatenated result.
   4934 static __inline __m256 __DEFAULT_FN_ATTRS
   4935 _mm256_set_m128 (__m128 __hi, __m128 __lo)
   4936 {
   4937   return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
   4938 }
   4939 
   4940 /// \brief Constructs a 256-bit floating-point vector of [4 x double] by
   4941 ///    concatenating two 128-bit floating-point vectors of [2 x double].
   4942 ///
   4943 /// \headerfile <x86intrin.h>
   4944 ///
   4945 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   4946 ///
   4947 /// \param __hi
   4948 ///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
   4949 ///    128 bits of the result.
   4950 /// \param __lo
   4951 ///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
   4952 ///    128 bits of the result.
   4953 /// \returns A 256-bit floating-point vector of [4 x double] containing the
   4954 ///    concatenated result.
   4955 static __inline __m256d __DEFAULT_FN_ATTRS
   4956 _mm256_set_m128d (__m128d __hi, __m128d __lo)
   4957 {
   4958   return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
   4959 }
   4960 
   4961 /// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
   4962 ///    integer vectors.
   4963 ///
   4964 /// \headerfile <x86intrin.h>
   4965 ///
   4966 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   4967 ///
   4968 /// \param __hi
   4969 ///    A 128-bit integer vector to be copied to the upper 128 bits of the
   4970 ///    result.
   4971 /// \param __lo
   4972 ///    A 128-bit integer vector to be copied to the lower 128 bits of the
   4973 ///    result.
   4974 /// \returns A 256-bit integer vector containing the concatenated result.
   4975 static __inline __m256i __DEFAULT_FN_ATTRS
   4976 _mm256_set_m128i (__m128i __hi, __m128i __lo)
   4977 {
   4978   return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
   4979 }
   4980 
   4981 /// \brief Constructs a 256-bit floating-point vector of [8 x float] by
   4982 ///    concatenating two 128-bit floating-point vectors of [4 x float]. This is
   4983 ///    similar to _mm256_set_m128, but the order of the input parameters is
   4984 ///    swapped.
   4985 ///
   4986 /// \headerfile <x86intrin.h>
   4987 ///
   4988 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   4989 ///
   4990 /// \param __lo
   4991 ///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
   4992 ///    128 bits of the result.
   4993 /// \param __hi
   4994 ///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
   4995 ///    128 bits of the result.
   4996 /// \returns A 256-bit floating-point vector of [8 x float] containing the
   4997 ///    concatenated result.
   4998 static __inline __m256 __DEFAULT_FN_ATTRS
   4999 _mm256_setr_m128 (__m128 __lo, __m128 __hi)
   5000 {
   5001   return _mm256_set_m128(__hi, __lo);
   5002 }
   5003 
   5004 /// \brief Constructs a 256-bit floating-point vector of [4 x double] by
   5005 ///    concatenating two 128-bit floating-point vectors of [2 x double]. This is
   5006 ///    similar to _mm256_set_m128d, but the order of the input parameters is
   5007 ///    swapped.
   5008 ///
   5009 /// \headerfile <x86intrin.h>
   5010 ///
   5011 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   5012 ///
   5013 /// \param __lo
   5014 ///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
   5015 ///    128 bits of the result.
   5016 /// \param __hi
   5017 ///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
   5018 ///    128 bits of the result.
   5019 /// \returns A 256-bit floating-point vector of [4 x double] containing the
   5020 ///    concatenated result.
   5021 static __inline __m256d __DEFAULT_FN_ATTRS
   5022 _mm256_setr_m128d (__m128d __lo, __m128d __hi)
   5023 {
   5024   return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
   5025 }
   5026 
   5027 /// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
   5028 ///    integer vectors. This is similar to _mm256_set_m128i, but the order of
   5029 ///    the input parameters is swapped.
   5030 ///
   5031 /// \headerfile <x86intrin.h>
   5032 ///
   5033 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   5034 ///
   5035 /// \param __lo
   5036 ///    A 128-bit integer vector to be copied to the lower 128 bits of the
   5037 ///    result.
   5038 /// \param __hi
   5039 ///    A 128-bit integer vector to be copied to the upper 128 bits of the
   5040 ///    result.
   5041 /// \returns A 256-bit integer vector containing the concatenated result.
   5042 static __inline __m256i __DEFAULT_FN_ATTRS
   5043 _mm256_setr_m128i (__m128i __lo, __m128i __hi)
   5044 {
   5045   return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
   5046 }
   5047 
   5048 #undef __DEFAULT_FN_ATTRS
   5049 
   5050 #endif /* __AVXINTRIN_H */
   5051