Home | History | Annotate | Download | only in include
      1 /*===---- avxintrin.h - AVX intrinsics -------------------------------------===
      2  *
      3  * Permission is hereby granted, free of charge, to any person obtaining a copy
      4  * of this software and associated documentation files (the "Software"), to deal
      5  * in the Software without restriction, including without limitation the rights
      6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      7  * copies of the Software, and to permit persons to whom the Software is
      8  * furnished to do so, subject to the following conditions:
      9  *
     10  * The above copyright notice and this permission notice shall be included in
     11  * all copies or substantial portions of the Software.
     12  *
     13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     19  * THE SOFTWARE.
     20  *
     21  *===-----------------------------------------------------------------------===
     22  */
     23 
     24 #ifndef __IMMINTRIN_H
     25 #error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
     26 #endif
     27 
     28 #ifndef __AVXINTRIN_H
     29 #define __AVXINTRIN_H
     30 
     31 typedef double __v4df __attribute__ ((__vector_size__ (32)));
     32 typedef float __v8sf __attribute__ ((__vector_size__ (32)));
     33 typedef long long __v4di __attribute__ ((__vector_size__ (32)));
     34 typedef int __v8si __attribute__ ((__vector_size__ (32)));
     35 typedef short __v16hi __attribute__ ((__vector_size__ (32)));
     36 typedef char __v32qi __attribute__ ((__vector_size__ (32)));
     37 
     38 /* Unsigned types */
     39 typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
     40 typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
     41 typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
     42 typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
     43 
     44 /* We need an explicitly signed variant for char. Note that this shouldn't
     45  * appear in the interface though. */
     46 typedef signed char __v32qs __attribute__((__vector_size__(32)));
     47 
     48 typedef float __m256 __attribute__ ((__vector_size__ (32)));
     49 typedef double __m256d __attribute__((__vector_size__(32)));
     50 typedef long long __m256i __attribute__((__vector_size__(32)));
     51 
     52 /* Define the default attributes for the functions in this file. */
     53 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))
     54 
     55 /* Arithmetic */
     56 /// \brief Adds two 256-bit vectors of [4 x double].
     57 ///
     58 /// \headerfile <x86intrin.h>
     59 ///
     60 /// This intrinsic corresponds to the <c> VADDPD </c> instruction.
     61 ///
     62 /// \param __a
     63 ///    A 256-bit vector of [4 x double] containing one of the source operands.
     64 /// \param __b
     65 ///    A 256-bit vector of [4 x double] containing one of the source operands.
     66 /// \returns A 256-bit vector of [4 x double] containing the sums of both
     67 ///    operands.
     68 static __inline __m256d __DEFAULT_FN_ATTRS
     69 _mm256_add_pd(__m256d __a, __m256d __b)
     70 {
     71   return (__m256d)((__v4df)__a+(__v4df)__b);
     72 }
     73 
     74 /// \brief Adds two 256-bit vectors of [8 x float].
     75 ///
     76 /// \headerfile <x86intrin.h>
     77 ///
     78 /// This intrinsic corresponds to the <c> VADDPS </c> instruction.
     79 ///
     80 /// \param __a
     81 ///    A 256-bit vector of [8 x float] containing one of the source operands.
     82 /// \param __b
     83 ///    A 256-bit vector of [8 x float] containing one of the source operands.
     84 /// \returns A 256-bit vector of [8 x float] containing the sums of both
     85 ///    operands.
     86 static __inline __m256 __DEFAULT_FN_ATTRS
     87 _mm256_add_ps(__m256 __a, __m256 __b)
     88 {
     89   return (__m256)((__v8sf)__a+(__v8sf)__b);
     90 }
     91 
     92 /// \brief Subtracts two 256-bit vectors of [4 x double].
     93 ///
     94 /// \headerfile <x86intrin.h>
     95 ///
     96 /// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
     97 ///
     98 /// \param __a
     99 ///    A 256-bit vector of [4 x double] containing the minuend.
    100 /// \param __b
    101 ///    A 256-bit vector of [4 x double] containing the subtrahend.
    102 /// \returns A 256-bit vector of [4 x double] containing the differences between
    103 ///    both operands.
    104 static __inline __m256d __DEFAULT_FN_ATTRS
    105 _mm256_sub_pd(__m256d __a, __m256d __b)
    106 {
    107   return (__m256d)((__v4df)__a-(__v4df)__b);
    108 }
    109 
    110 /// \brief Subtracts two 256-bit vectors of [8 x float].
    111 ///
    112 /// \headerfile <x86intrin.h>
    113 ///
    114 /// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
    115 ///
    116 /// \param __a
    117 ///    A 256-bit vector of [8 x float] containing the minuend.
    118 /// \param __b
    119 ///    A 256-bit vector of [8 x float] containing the subtrahend.
    120 /// \returns A 256-bit vector of [8 x float] containing the differences between
    121 ///    both operands.
    122 static __inline __m256 __DEFAULT_FN_ATTRS
    123 _mm256_sub_ps(__m256 __a, __m256 __b)
    124 {
    125   return (__m256)((__v8sf)__a-(__v8sf)__b);
    126 }
    127 
    128 /// \brief Adds the even-indexed values and subtracts the odd-indexed values of
    129 ///    two 256-bit vectors of [4 x double].
    130 ///
    131 /// \headerfile <x86intrin.h>
    132 ///
    133 /// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
    134 ///
    135 /// \param __a
    136 ///    A 256-bit vector of [4 x double] containing the left source operand.
    137 /// \param __b
    138 ///    A 256-bit vector of [4 x double] containing the right source operand.
    139 /// \returns A 256-bit vector of [4 x double] containing the alternating sums
    140 ///    and differences between both operands.
    141 static __inline __m256d __DEFAULT_FN_ATTRS
    142 _mm256_addsub_pd(__m256d __a, __m256d __b)
    143 {
    144   return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
    145 }
    146 
    147 /// \brief Adds the even-indexed values and subtracts the odd-indexed values of
    148 ///    two 256-bit vectors of [8 x float].
    149 ///
    150 /// \headerfile <x86intrin.h>
    151 ///
    152 /// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
    153 ///
    154 /// \param __a
    155 ///    A 256-bit vector of [8 x float] containing the left source operand.
    156 /// \param __b
    157 ///    A 256-bit vector of [8 x float] containing the right source operand.
    158 /// \returns A 256-bit vector of [8 x float] containing the alternating sums and
    159 ///    differences between both operands.
    160 static __inline __m256 __DEFAULT_FN_ATTRS
    161 _mm256_addsub_ps(__m256 __a, __m256 __b)
    162 {
    163   return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
    164 }
    165 
    166 /// \brief Divides two 256-bit vectors of [4 x double].
    167 ///
    168 /// \headerfile <x86intrin.h>
    169 ///
    170 /// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
    171 ///
    172 /// \param __a
    173 ///    A 256-bit vector of [4 x double] containing the dividend.
    174 /// \param __b
    175 ///    A 256-bit vector of [4 x double] containing the divisor.
    176 /// \returns A 256-bit vector of [4 x double] containing the quotients of both
    177 ///    operands.
    178 static __inline __m256d __DEFAULT_FN_ATTRS
    179 _mm256_div_pd(__m256d __a, __m256d __b)
    180 {
    181   return (__m256d)((__v4df)__a/(__v4df)__b);
    182 }
    183 
    184 /// \brief Divides two 256-bit vectors of [8 x float].
    185 ///
    186 /// \headerfile <x86intrin.h>
    187 ///
    188 /// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
    189 ///
    190 /// \param __a
    191 ///    A 256-bit vector of [8 x float] containing the dividend.
    192 /// \param __b
    193 ///    A 256-bit vector of [8 x float] containing the divisor.
    194 /// \returns A 256-bit vector of [8 x float] containing the quotients of both
    195 ///    operands.
    196 static __inline __m256 __DEFAULT_FN_ATTRS
    197 _mm256_div_ps(__m256 __a, __m256 __b)
    198 {
    199   return (__m256)((__v8sf)__a/(__v8sf)__b);
    200 }
    201 
    202 /// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
    203 ///    of each pair of values.
    204 ///
    205 /// \headerfile <x86intrin.h>
    206 ///
    207 /// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
    208 ///
    209 /// \param __a
    210 ///    A 256-bit vector of [4 x double] containing one of the operands.
    211 /// \param __b
    212 ///    A 256-bit vector of [4 x double] containing one of the operands.
    213 /// \returns A 256-bit vector of [4 x double] containing the maximum values
    214 ///    between both operands.
    215 static __inline __m256d __DEFAULT_FN_ATTRS
    216 _mm256_max_pd(__m256d __a, __m256d __b)
    217 {
    218   return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
    219 }
    220 
    221 /// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
    222 ///    of each pair of values.
    223 ///
    224 /// \headerfile <x86intrin.h>
    225 ///
    226 /// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
    227 ///
    228 /// \param __a
    229 ///    A 256-bit vector of [8 x float] containing one of the operands.
    230 /// \param __b
    231 ///    A 256-bit vector of [8 x float] containing one of the operands.
    232 /// \returns A 256-bit vector of [8 x float] containing the maximum values
    233 ///    between both operands.
    234 static __inline __m256 __DEFAULT_FN_ATTRS
    235 _mm256_max_ps(__m256 __a, __m256 __b)
    236 {
    237   return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
    238 }
    239 
    240 /// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
    241 ///    of each pair of values.
    242 ///
    243 /// \headerfile <x86intrin.h>
    244 ///
    245 /// This intrinsic corresponds to the <c> VMINPD </c> instruction.
    246 ///
    247 /// \param __a
    248 ///    A 256-bit vector of [4 x double] containing one of the operands.
    249 /// \param __b
    250 ///    A 256-bit vector of [4 x double] containing one of the operands.
    251 /// \returns A 256-bit vector of [4 x double] containing the minimum values
    252 ///    between both operands.
    253 static __inline __m256d __DEFAULT_FN_ATTRS
    254 _mm256_min_pd(__m256d __a, __m256d __b)
    255 {
    256   return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
    257 }
    258 
    259 /// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
    260 ///    of each pair of values.
    261 ///
    262 /// \headerfile <x86intrin.h>
    263 ///
    264 /// This intrinsic corresponds to the <c> VMINPS </c> instruction.
    265 ///
    266 /// \param __a
    267 ///    A 256-bit vector of [8 x float] containing one of the operands.
    268 /// \param __b
    269 ///    A 256-bit vector of [8 x float] containing one of the operands.
    270 /// \returns A 256-bit vector of [8 x float] containing the minimum values
    271 ///    between both operands.
    272 static __inline __m256 __DEFAULT_FN_ATTRS
    273 _mm256_min_ps(__m256 __a, __m256 __b)
    274 {
    275   return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
    276 }
    277 
    278 /// \brief Multiplies two 256-bit vectors of [4 x double].
    279 ///
    280 /// \headerfile <x86intrin.h>
    281 ///
    282 /// This intrinsic corresponds to the <c> VMULPD </c> instruction.
    283 ///
    284 /// \param __a
    285 ///    A 256-bit vector of [4 x double] containing one of the operands.
    286 /// \param __b
    287 ///    A 256-bit vector of [4 x double] containing one of the operands.
    288 /// \returns A 256-bit vector of [4 x double] containing the products of both
    289 ///    operands.
    290 static __inline __m256d __DEFAULT_FN_ATTRS
    291 _mm256_mul_pd(__m256d __a, __m256d __b)
    292 {
    293   return (__m256d)((__v4df)__a * (__v4df)__b);
    294 }
    295 
    296 /// \brief Multiplies two 256-bit vectors of [8 x float].
    297 ///
    298 /// \headerfile <x86intrin.h>
    299 ///
    300 /// This intrinsic corresponds to the <c> VMULPS </c> instruction.
    301 ///
    302 /// \param __a
    303 ///    A 256-bit vector of [8 x float] containing one of the operands.
    304 /// \param __b
    305 ///    A 256-bit vector of [8 x float] containing one of the operands.
    306 /// \returns A 256-bit vector of [8 x float] containing the products of both
    307 ///    operands.
    308 static __inline __m256 __DEFAULT_FN_ATTRS
    309 _mm256_mul_ps(__m256 __a, __m256 __b)
    310 {
    311   return (__m256)((__v8sf)__a * (__v8sf)__b);
    312 }
    313 
    314 /// \brief Calculates the square roots of the values in a 256-bit vector of
    315 ///    [4 x double].
    316 ///
    317 /// \headerfile <x86intrin.h>
    318 ///
    319 /// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
    320 ///
    321 /// \param __a
    322 ///    A 256-bit vector of [4 x double].
    323 /// \returns A 256-bit vector of [4 x double] containing the square roots of the
    324 ///    values in the operand.
    325 static __inline __m256d __DEFAULT_FN_ATTRS
    326 _mm256_sqrt_pd(__m256d __a)
    327 {
    328   return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
    329 }
    330 
    331 /// \brief Calculates the square roots of the values in a 256-bit vector of
    332 ///    [8 x float].
    333 ///
    334 /// \headerfile <x86intrin.h>
    335 ///
    336 /// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
    337 ///
    338 /// \param __a
    339 ///    A 256-bit vector of [8 x float].
    340 /// \returns A 256-bit vector of [8 x float] containing the square roots of the
    341 ///    values in the operand.
    342 static __inline __m256 __DEFAULT_FN_ATTRS
    343 _mm256_sqrt_ps(__m256 __a)
    344 {
    345   return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
    346 }
    347 
    348 /// \brief Calculates the reciprocal square roots of the values in a 256-bit
    349 ///    vector of [8 x float].
    350 ///
    351 /// \headerfile <x86intrin.h>
    352 ///
    353 /// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
    354 ///
    355 /// \param __a
    356 ///    A 256-bit vector of [8 x float].
    357 /// \returns A 256-bit vector of [8 x float] containing the reciprocal square
    358 ///    roots of the values in the operand.
    359 static __inline __m256 __DEFAULT_FN_ATTRS
    360 _mm256_rsqrt_ps(__m256 __a)
    361 {
    362   return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
    363 }
    364 
    365 /// \brief Calculates the reciprocals of the values in a 256-bit vector of
    366 ///    [8 x float].
    367 ///
    368 /// \headerfile <x86intrin.h>
    369 ///
    370 /// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
    371 ///
    372 /// \param __a
    373 ///    A 256-bit vector of [8 x float].
    374 /// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
    375 ///    values in the operand.
    376 static __inline __m256 __DEFAULT_FN_ATTRS
    377 _mm256_rcp_ps(__m256 __a)
    378 {
    379   return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
    380 }
    381 
    382 /// \brief Rounds the values in a 256-bit vector of [4 x double] as specified
    383 ///    by the byte operand. The source values are rounded to integer values and
    384 ///    returned as 64-bit double-precision floating-point values.
    385 ///
    386 /// \headerfile <x86intrin.h>
    387 ///
    388 /// \code
    389 /// __m256d _mm256_round_pd(__m256d V, const int M);
    390 /// \endcode
    391 ///
    392 /// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
    393 ///
    394 /// \param V
    395 ///    A 256-bit vector of [4 x double].
    396 /// \param M
    397 ///    An integer value that specifies the rounding operation. \n
    398 ///    Bits [7:4] are reserved. \n
    399 ///    Bit [3] is a precision exception value: \n
    400 ///      0: A normal PE exception is used. \n
    401 ///      1: The PE field is not updated. \n
    402 ///    Bit [2] is the rounding control source: \n
    403 ///      0: Use bits [1:0] of \a M. \n
    404 ///      1: Use the current MXCSR setting. \n
    405 ///    Bits [1:0] contain the rounding control definition: \n
    406 ///      00: Nearest. \n
    407 ///      01: Downward (toward negative infinity). \n
    408 ///      10: Upward (toward positive infinity). \n
    409 ///      11: Truncated.
    410 /// \returns A 256-bit vector of [4 x double] containing the rounded values.
    411 #define _mm256_round_pd(V, M) __extension__ ({ \
    412     (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); })
    413 
    414 /// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
    415 ///    specified by the byte operand. The source values are rounded to integer
    416 ///    values and returned as floating-point values.
    417 ///
    418 /// \headerfile <x86intrin.h>
    419 ///
    420 /// \code
    421 /// __m256 _mm256_round_ps(__m256 V, const int M);
    422 /// \endcode
    423 ///
    424 /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
    425 ///
    426 /// \param V
    427 ///    A 256-bit vector of [8 x float].
    428 /// \param M
    429 ///    An integer value that specifies the rounding operation. \n
    430 ///    Bits [7:4] are reserved. \n
    431 ///    Bit [3] is a precision exception value: \n
    432 ///      0: A normal PE exception is used. \n
    433 ///      1: The PE field is not updated. \n
    434 ///    Bit [2] is the rounding control source: \n
    435 ///      0: Use bits [1:0] of \a M. \n
    436 ///      1: Use the current MXCSR setting. \n
    437 ///    Bits [1:0] contain the rounding control definition: \n
    438 ///      00: Nearest. \n
    439 ///      01: Downward (toward negative infinity). \n
    440 ///      10: Upward (toward positive infinity). \n
    441 ///      11: Truncated.
    442 /// \returns A 256-bit vector of [8 x float] containing the rounded values.
    443 #define _mm256_round_ps(V, M) __extension__ ({ \
    444   (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); })
    445 
    446 /// \brief Rounds up the values stored in a 256-bit vector of [4 x double]. The
    447 ///    source values are rounded up to integer values and returned as 64-bit
    448 ///    double-precision floating-point values.
    449 ///
    450 /// \headerfile <x86intrin.h>
    451 ///
    452 /// \code
    453 /// __m256d _mm256_ceil_pd(__m256d V);
    454 /// \endcode
    455 ///
    456 /// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
    457 ///
    458 /// \param V
    459 ///    A 256-bit vector of [4 x double].
    460 /// \returns A 256-bit vector of [4 x double] containing the rounded up values.
    461 #define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
    462 
    463 /// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
    464 ///    The source values are rounded down to integer values and returned as
    465 ///    64-bit double-precision floating-point values.
    466 ///
    467 /// \headerfile <x86intrin.h>
    468 ///
    469 /// \code
    470 /// __m256d _mm256_floor_pd(__m256d V);
    471 /// \endcode
    472 ///
    473 /// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
    474 ///
    475 /// \param V
    476 ///    A 256-bit vector of [4 x double].
    477 /// \returns A 256-bit vector of [4 x double] containing the rounded down
    478 ///    values.
    479 #define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
    480 
    481 /// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
    482 ///    source values are rounded up to integer values and returned as
    483 ///    floating-point values.
    484 ///
    485 /// \headerfile <x86intrin.h>
    486 ///
    487 /// \code
    488 /// __m256 _mm256_ceil_ps(__m256 V);
    489 /// \endcode
    490 ///
    491 /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
    492 ///
    493 /// \param V
    494 ///    A 256-bit vector of [8 x float].
    495 /// \returns A 256-bit vector of [8 x float] containing the rounded up values.
    496 #define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
    497 
    498 /// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The
    499 ///    source values are rounded down to integer values and returned as
    500 ///    floating-point values.
    501 ///
    502 /// \headerfile <x86intrin.h>
    503 ///
    504 /// \code
    505 /// __m256 _mm256_floor_ps(__m256 V);
    506 /// \endcode
    507 ///
    508 /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
    509 ///
    510 /// \param V
    511 ///    A 256-bit vector of [8 x float].
    512 /// \returns A 256-bit vector of [8 x float] containing the rounded down values.
    513 #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
    514 
    515 /* Logical */
    516 /// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
    517 ///
    518 /// \headerfile <x86intrin.h>
    519 ///
    520 /// This intrinsic corresponds to the <c> VANDPD </c> instruction.
    521 ///
    522 /// \param __a
    523 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    524 /// \param __b
    525 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    526 /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
    527 ///    values between both operands.
    528 static __inline __m256d __DEFAULT_FN_ATTRS
    529 _mm256_and_pd(__m256d __a, __m256d __b)
    530 {
    531   return (__m256d)((__v4du)__a & (__v4du)__b);
    532 }
    533 
    534 /// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
    535 ///
    536 /// \headerfile <x86intrin.h>
    537 ///
    538 /// This intrinsic corresponds to the <c> VANDPS </c> instruction.
    539 ///
    540 /// \param __a
    541 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    542 /// \param __b
    543 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    544 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
    545 ///    values between both operands.
    546 static __inline __m256 __DEFAULT_FN_ATTRS
    547 _mm256_and_ps(__m256 __a, __m256 __b)
    548 {
    549   return (__m256)((__v8su)__a & (__v8su)__b);
    550 }
    551 
    552 /// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
    553 ///    the one's complement of the values contained in the first source operand.
    554 ///
    555 /// \headerfile <x86intrin.h>
    556 ///
    557 /// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
    558 ///
    559 /// \param __a
    560 ///    A 256-bit vector of [4 x double] containing the left source operand. The
    561 ///    one's complement of this value is used in the bitwise AND.
    562 /// \param __b
    563 ///    A 256-bit vector of [4 x double] containing the right source operand.
    564 /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
    565 ///    values of the second operand and the one's complement of the first
    566 ///    operand.
    567 static __inline __m256d __DEFAULT_FN_ATTRS
    568 _mm256_andnot_pd(__m256d __a, __m256d __b)
    569 {
    570   return (__m256d)(~(__v4du)__a & (__v4du)__b);
    571 }
    572 
    573 /// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
    574 ///    the one's complement of the values contained in the first source operand.
    575 ///
    576 /// \headerfile <x86intrin.h>
    577 ///
    578 /// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
    579 ///
    580 /// \param __a
    581 ///    A 256-bit vector of [8 x float] containing the left source operand. The
    582 ///    one's complement of this value is used in the bitwise AND.
    583 /// \param __b
    584 ///    A 256-bit vector of [8 x float] containing the right source operand.
    585 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
    586 ///    values of the second operand and the one's complement of the first
    587 ///    operand.
    588 static __inline __m256 __DEFAULT_FN_ATTRS
    589 _mm256_andnot_ps(__m256 __a, __m256 __b)
    590 {
    591   return (__m256)(~(__v8su)__a & (__v8su)__b);
    592 }
    593 
    594 /// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
    595 ///
    596 /// \headerfile <x86intrin.h>
    597 ///
    598 /// This intrinsic corresponds to the <c> VORPD </c> instruction.
    599 ///
    600 /// \param __a
    601 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    602 /// \param __b
    603 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    604 /// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
    605 ///    values between both operands.
    606 static __inline __m256d __DEFAULT_FN_ATTRS
    607 _mm256_or_pd(__m256d __a, __m256d __b)
    608 {
    609   return (__m256d)((__v4du)__a | (__v4du)__b);
    610 }
    611 
    612 /// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
    613 ///
    614 /// \headerfile <x86intrin.h>
    615 ///
    616 /// This intrinsic corresponds to the <c> VORPS </c> instruction.
    617 ///
    618 /// \param __a
    619 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    620 /// \param __b
    621 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    622 /// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
    623 ///    values between both operands.
    624 static __inline __m256 __DEFAULT_FN_ATTRS
    625 _mm256_or_ps(__m256 __a, __m256 __b)
    626 {
    627   return (__m256)((__v8su)__a | (__v8su)__b);
    628 }
    629 
    630 /// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
    631 ///
    632 /// \headerfile <x86intrin.h>
    633 ///
    634 /// This intrinsic corresponds to the <c> VXORPD </c> instruction.
    635 ///
    636 /// \param __a
    637 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    638 /// \param __b
    639 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    640 /// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
    641 ///    values between both operands.
    642 static __inline __m256d __DEFAULT_FN_ATTRS
    643 _mm256_xor_pd(__m256d __a, __m256d __b)
    644 {
    645   return (__m256d)((__v4du)__a ^ (__v4du)__b);
    646 }
    647 
    648 /// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
    649 ///
    650 /// \headerfile <x86intrin.h>
    651 ///
    652 /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
    653 ///
    654 /// \param __a
    655 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    656 /// \param __b
    657 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    658 /// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
    659 ///    values between both operands.
    660 static __inline __m256 __DEFAULT_FN_ATTRS
    661 _mm256_xor_ps(__m256 __a, __m256 __b)
    662 {
    663   return (__m256)((__v8su)__a ^ (__v8su)__b);
    664 }
    665 
    666 /* Horizontal arithmetic */
    667 /// \brief Horizontally adds the adjacent pairs of values contained in two
    668 ///    256-bit vectors of [4 x double].
    669 ///
    670 /// \headerfile <x86intrin.h>
    671 ///
    672 /// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
    673 ///
    674 /// \param __a
    675 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    676 ///    The horizontal sums of the values are returned in the even-indexed
    677 ///    elements of a vector of [4 x double].
    678 /// \param __b
    679 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    680 ///    The horizontal sums of the values are returned in the odd-indexed
    681 ///    elements of a vector of [4 x double].
    682 /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
    683 ///    both operands.
    684 static __inline __m256d __DEFAULT_FN_ATTRS
    685 _mm256_hadd_pd(__m256d __a, __m256d __b)
    686 {
    687   return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
    688 }
    689 
    690 /// \brief Horizontally adds the adjacent pairs of values contained in two
    691 ///    256-bit vectors of [8 x float].
    692 ///
    693 /// \headerfile <x86intrin.h>
    694 ///
    695 /// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
    696 ///
    697 /// \param __a
    698 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    699 ///    The horizontal sums of the values are returned in the elements with
    700 ///    index 0, 1, 4, 5 of a vector of [8 x float].
    701 /// \param __b
    702 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    703 ///    The horizontal sums of the values are returned in the elements with
    704 ///    index 2, 3, 6, 7 of a vector of [8 x float].
    705 /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
    706 ///    both operands.
    707 static __inline __m256 __DEFAULT_FN_ATTRS
    708 _mm256_hadd_ps(__m256 __a, __m256 __b)
    709 {
    710   return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
    711 }
    712 
    713 /// \brief Horizontally subtracts the adjacent pairs of values contained in two
    714 ///    256-bit vectors of [4 x double].
    715 ///
    716 /// \headerfile <x86intrin.h>
    717 ///
    718 /// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
    719 ///
    720 /// \param __a
    721 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    722 ///    The horizontal differences between the values are returned in the
    723 ///    even-indexed elements of a vector of [4 x double].
    724 /// \param __b
    725 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    726 ///    The horizontal differences between the values are returned in the
    727 ///    odd-indexed elements of a vector of [4 x double].
    728 /// \returns A 256-bit vector of [4 x double] containing the horizontal
    729 ///    differences of both operands.
    730 static __inline __m256d __DEFAULT_FN_ATTRS
    731 _mm256_hsub_pd(__m256d __a, __m256d __b)
    732 {
          /* The builtin is called with __v4df operands (the [4 x double] vector
             typedef from the top of this header); its result is cast back to
             the public __m256d type. */
    733   return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
    734 }
    735 
    736 /// \brief Horizontally subtracts the adjacent pairs of values contained in two
    737 ///    256-bit vectors of [8 x float].
    738 ///
    739 /// \headerfile <x86intrin.h>
    740 ///
    741 /// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
    742 ///
    743 /// \param __a
    744 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    745 ///    The horizontal differences between the values are returned in the
    746 ///    elements with index 0, 1, 4, 5 of a vector of [8 x float].
    747 /// \param __b
    748 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    749 ///    The horizontal differences between the values are returned in the
    750 ///    elements with index 2, 3, 6, 7 of a vector of [8 x float].
    751 /// \returns A 256-bit vector of [8 x float] containing the horizontal
    752 ///    differences of both operands.
    753 static __inline __m256 __DEFAULT_FN_ATTRS
    754 _mm256_hsub_ps(__m256 __a, __m256 __b)
    755 {
          /* Same shape as _mm256_hsub_pd, but on [8 x float]: operands are cast
             to __v8sf for the builtin and the result is cast back to __m256. */
    756   return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
    757 }
    758 
    759 /* Vector permutations */
    760 /// \brief Copies the values in a 128-bit vector of [2 x double] as specified
    761 ///    by the 128-bit integer vector operand.
    762 ///
    763 /// \headerfile <x86intrin.h>
    764 ///
    765 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
    766 ///
    767 /// \param __a
    768 ///    A 128-bit vector of [2 x double].
    769 /// \param __c
    770 ///    A 128-bit integer vector operand specifying how the values are to be
    771 ///    copied. \n
    772 ///    Bit [1]: \n
    773 ///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
    774 ///         vector. \n
    775 ///      1: Bits [127:64] of the source are copied to bits [63:0] of the
    776 ///         returned vector. \n
    777 ///    Bit [65]: \n
    778 ///      0: Bits [63:0] of the source are copied to bits [127:64] of the
    779 ///         returned vector. \n
    780 ///      1: Bits [127:64] of the source are copied to bits [127:64] of the
    781 ///         returned vector.
    782 /// \returns A 128-bit vector of [2 x double] containing the copied values.
    783 static __inline __m128d __DEFAULT_FN_ATTRS
    784 _mm_permutevar_pd(__m128d __a, __m128i __c)
    785 {
          /* The data operand is passed as __v2df and the selector vector as
             __v2di. Neither typedef is declared in the visible part of this
             header; presumably they come from the SSE2 header. */
    786   return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
    787 }
    788 
    789 /// \brief Copies the values in a 256-bit vector of [4 x double] as specified
    790 ///    by the 256-bit integer vector operand.
    791 ///
    792 /// \headerfile <x86intrin.h>
    793 ///
    794 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
    795 ///
    796 /// \param __a
    797 ///    A 256-bit vector of [4 x double].
    798 /// \param __c
    799 ///    A 256-bit integer vector operand specifying how the values are to be
    800 ///    copied. \n
    801 ///    Bit [1]: \n
    802 ///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
    803 ///         vector. \n
    804 ///      1: Bits [127:64] of the source are copied to bits [63:0] of the
    805 ///         returned vector. \n
    806 ///    Bit [65]: \n
    807 ///      0: Bits [63:0] of the source are copied to bits [127:64] of the
    808 ///         returned vector. \n
    809 ///      1: Bits [127:64] of the source are copied to bits [127:64] of the
    810 ///         returned vector. \n
    811 ///    Bit [129]: \n
    812 ///      0: Bits [191:128] of the source are copied to bits [191:128] of the
    813 ///         returned vector. \n
    814 ///      1: Bits [255:192] of the source are copied to bits [191:128] of the
    815 ///         returned vector. \n
    816 ///    Bit [193]: \n
    817 ///      0: Bits [191:128] of the source are copied to bits [255:192] of the
    818 ///         returned vector. \n
    819 ///      1: Bits [255:192] of the source are copied to bits [255:192] of the
    820 ///         returned vector.
    821 /// \returns A 256-bit vector of [4 x double] containing the copied values.
    822 static __inline __m256d __DEFAULT_FN_ATTRS
    823 _mm256_permutevar_pd(__m256d __a, __m256i __c)
    824 {
          /* __c is viewed as four 64-bit integers (__v4di), matching the
             selector layout documented above: one control bit inside each
             64-bit element. The data operand is viewed as __v4df. */
    825   return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
    826 }
    827 
    828 /// \brief Copies the values stored in a 128-bit vector of [4 x float] as
    829 ///    specified by the 128-bit integer vector operand.
    830 /// \headerfile <x86intrin.h>
    831 ///
    832 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
    833 ///
    834 /// \param __a
    835 ///    A 128-bit vector of [4 x float].
    836 /// \param __c
    837 ///    A 128-bit integer vector operand specifying how the values are to be
    838 ///    copied. \n
    839 ///    Bits [1:0]: \n
    840 ///      00: Bits [31:0] of the source are copied to bits [31:0] of the
    841 ///          returned vector. \n
    842 ///      01: Bits [63:32] of the source are copied to bits [31:0] of the
    843 ///          returned vector. \n
    844 ///      10: Bits [95:64] of the source are copied to bits [31:0] of the
    845 ///          returned vector. \n
    846 ///      11: Bits [127:96] of the source are copied to bits [31:0] of the
    847 ///          returned vector. \n
    848 ///    Bits [33:32]: \n
    849 ///      00: Bits [31:0] of the source are copied to bits [63:32] of the
    850 ///          returned vector. \n
    851 ///      01: Bits [63:32] of the source are copied to bits [63:32] of the
    852 ///          returned vector. \n
    853 ///      10: Bits [95:64] of the source are copied to bits [63:32] of the
    854 ///          returned vector. \n
    855 ///      11: Bits [127:96] of the source are copied to bits [63:32] of the
    856 ///          returned vector. \n
    857 ///    Bits [65:64]: \n
    858 ///      00: Bits [31:0] of the source are copied to bits [95:64] of the
    859 ///          returned vector. \n
    860 ///      01: Bits [63:32] of the source are copied to bits [95:64] of the
    861 ///          returned vector. \n
    862 ///      10: Bits [95:64] of the source are copied to bits [95:64] of the
    863 ///          returned vector. \n
    864 ///      11: Bits [127:96] of the source are copied to bits [95:64] of the
    865 ///          returned vector. \n
    866 ///    Bits [97:96]: \n
    867 ///      00: Bits [31:0] of the source are copied to bits [127:96] of the
    868 ///          returned vector. \n
    869 ///      01: Bits [63:32] of the source are copied to bits [127:96] of the
    870 ///          returned vector. \n
    871 ///      10: Bits [95:64] of the source are copied to bits [127:96] of the
    872 ///          returned vector. \n
    873 ///      11: Bits [127:96] of the source are copied to bits [127:96] of the
    874 ///          returned vector.
    875 /// \returns A 128-bit vector of [4 x float] containing the copied values.
    876 static __inline __m128 __DEFAULT_FN_ATTRS
    877 _mm_permutevar_ps(__m128 __a, __m128i __c)
    878 {
          /* The data operand is passed as __v4sf and the selector vector as
             __v4si. Neither typedef is declared in the visible part of this
             header; presumably they come from the SSE/SSE2 headers. */
    879   return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
    880 }
    881 
    882 /// \brief Copies the values stored in a 256-bit vector of [8 x float] as
    883 ///    specified by the 256-bit integer vector operand.
    884 ///
    885 /// \headerfile <x86intrin.h>
    886 ///
    887 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
    888 ///
    889 /// \param __a
    890 ///    A 256-bit vector of [8 x float].
    891 /// \param __c
    892 ///    A 256-bit integer vector operand specifying how the values are to be
    893 ///    copied. \n
    894 ///    Bits [1:0]: \n
    895 ///      00: Bits [31:0] of the source are copied to bits [31:0] of the
    896 ///          returned vector. \n
    897 ///      01: Bits [63:32] of the source are copied to bits [31:0] of the
    898 ///          returned vector. \n
    899 ///      10: Bits [95:64] of the source are copied to bits [31:0] of the
    900 ///          returned vector. \n
    901 ///      11: Bits [127:96] of the source are copied to bits [31:0] of the
    902 ///          returned vector. \n
    903 ///    Bits [33:32]: \n
    904 ///      00: Bits [31:0] of the source are copied to bits [63:32] of the
    905 ///          returned vector. \n
    906 ///      01: Bits [63:32] of the source are copied to bits [63:32] of the
    907 ///          returned vector. \n
    908 ///      10: Bits [95:64] of the source are copied to bits [63:32] of the
    909 ///          returned vector. \n
    910 ///      11: Bits [127:96] of the source are copied to bits [63:32] of the
    911 ///          returned vector. \n
    912 ///    Bits [65:64]: \n
    913 ///      00: Bits [31:0] of the source are copied to bits [95:64] of the
    914 ///          returned vector. \n
    915 ///      01: Bits [63:32] of the source are copied to bits [95:64] of the
    916 ///          returned vector. \n
    917 ///      10: Bits [95:64] of the source are copied to bits [95:64] of the
    918 ///          returned vector. \n
    919 ///      11: Bits [127:96] of the source are copied to bits [95:64] of the
    920 ///          returned vector. \n
    921 ///    Bits [97:96]: \n
    922 ///      00: Bits [31:0] of the source are copied to bits [127:96] of the
    923 ///          returned vector. \n
    924 ///      01: Bits [63:32] of the source are copied to bits [127:96] of the
    925 ///          returned vector. \n
    926 ///      10: Bits [95:64] of the source are copied to bits [127:96] of the
    927 ///          returned vector. \n
    928 ///      11: Bits [127:96] of the source are copied to bits [127:96] of the
    929 ///          returned vector. \n
    930 ///    Bits [129:128]: \n
    931 ///      00: Bits [159:128] of the source are copied to bits [159:128] of the
    932 ///          returned vector. \n
    933 ///      01: Bits [191:160] of the source are copied to bits [159:128] of the
    934 ///          returned vector. \n
    935 ///      10: Bits [223:192] of the source are copied to bits [159:128] of the
    936 ///          returned vector. \n
    937 ///      11: Bits [255:224] of the source are copied to bits [159:128] of the
    938 ///          returned vector. \n
    939 ///    Bits [161:160]: \n
    940 ///      00: Bits [159:128] of the source are copied to bits [191:160] of the
    941 ///          returned vector. \n
    942 ///      01: Bits [191:160] of the source are copied to bits [191:160] of the
    943 ///          returned vector. \n
    944 ///      10: Bits [223:192] of the source are copied to bits [191:160] of the
    945 ///          returned vector. \n
    946 ///      11: Bits [255:224] of the source are copied to bits [191:160] of the
    947 ///          returned vector. \n
    948 ///    Bits [193:192]: \n
    949 ///      00: Bits [159:128] of the source are copied to bits [223:192] of the
    950 ///          returned vector. \n
    951 ///      01: Bits [191:160] of the source are copied to bits [223:192] of the
    952 ///          returned vector. \n
    953 ///      10: Bits [223:192] of the source are copied to bits [223:192] of the
    954 ///          returned vector. \n
    955 ///      11: Bits [255:224] of the source are copied to bits [223:192] of the
    956 ///          returned vector. \n
    957 ///    Bits [225:224]: \n
    958 ///      00: Bits [159:128] of the source are copied to bits [255:224] of the
    959 ///          returned vector. \n
    960 ///      01: Bits [191:160] of the source are copied to bits [255:224] of the
    961 ///          returned vector. \n
    962 ///      10: Bits [223:192] of the source are copied to bits [255:224] of the
    963 ///          returned vector. \n
    964 ///      11: Bits [255:224] of the source are copied to bits [255:224] of the
    965 ///          returned vector.
    966 /// \returns A 256-bit vector of [8 x float] containing the copied values.
    967 static __inline __m256 __DEFAULT_FN_ATTRS
    968 _mm256_permutevar_ps(__m256 __a, __m256i __c)
    969 {
          /* __c is viewed as eight 32-bit integers (__v8si), matching the
             selector layout documented above: a 2-bit field at the bottom of
             each 32-bit element. The data operand is viewed as __v8sf. */
    970   return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
    971 }
    972 
    973 /// \brief Copies the values in a 128-bit vector of [2 x double] as specified
    974 ///    by the immediate integer operand.
    975 ///
    976 /// \headerfile <x86intrin.h>
    977 ///
    978 /// \code
    979 /// __m128d _mm_permute_pd(__m128d A, const int C);
    980 /// \endcode
    981 ///
    982 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
    983 ///
    984 /// \param A
    985 ///    A 128-bit vector of [2 x double].
    986 /// \param C
    987 ///    An immediate integer operand specifying how the values are to be
    988 ///    copied. \n
    989 ///    Bit [0]: \n
    990 ///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
    991 ///         vector. \n
    992 ///      1: Bits [127:64] of the source are copied to bits [63:0] of the
    993 ///         returned vector. \n
    994 ///    Bit [1]: \n
    995 ///      0: Bits [63:0] of the source are copied to bits [127:64] of the
    996 ///         returned vector. \n
    997 ///      1: Bits [127:64] of the source are copied to bits [127:64] of the
    998 ///         returned vector.
    999 /// \returns A 128-bit vector of [2 x double] containing the copied values.
   1000 #define _mm_permute_pd(A, C) __extension__ ({ /* Each shuffle index is a
          single bit of C (0 or 1). __builtin_shufflevector numbers the
          elements of its first vector from 0, so only elements of A are ever
          selected; the undefined vector merely fills the second operand. */ \
   1001   (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
   1002                                    (__v2df)_mm_undefined_pd(), \
   1003                                    ((C) >> 0) & 0x1, ((C) >> 1) & 0x1); })
   1004 
   1005 /// \brief Copies the values in a 256-bit vector of [4 x double] as specified by
   1006 ///    the immediate integer operand.
   1007 ///
   1008 /// \headerfile <x86intrin.h>
   1009 ///
   1010 /// \code
   1011 /// __m256d _mm256_permute_pd(__m256d A, const int C);
   1012 /// \endcode
   1013 ///
   1014 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
   1015 ///
   1016 /// \param A
   1017 ///    A 256-bit vector of [4 x double].
   1018 /// \param C
   1019 ///    An immediate integer operand specifying how the values are to be
   1020 ///    copied. \n
   1021 ///    Bit [0]: \n
   1022 ///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
   1023 ///         vector. \n
   1024 ///      1: Bits [127:64] of the source are copied to bits [63:0] of the
   1025 ///         returned vector. \n
   1026 ///    Bit [1]: \n
   1027 ///      0: Bits [63:0] of the source are copied to bits [127:64] of the
   1028 ///         returned vector. \n
   1029 ///      1: Bits [127:64] of the source are copied to bits [127:64] of the
   1030 ///         returned vector. \n
   1031 ///    Bit [2]: \n
   1032 ///      0: Bits [191:128] of the source are copied to bits [191:128] of the
   1033 ///         returned vector. \n
   1034 ///      1: Bits [255:192] of the source are copied to bits [191:128] of the
   1035 ///         returned vector. \n
   1036 ///    Bit [3]: \n
   1037 ///      0: Bits [191:128] of the source are copied to bits [255:192] of the
   1038 ///         returned vector. \n
   1039 ///      1: Bits [255:192] of the source are copied to bits [255:192] of the
   1040 ///         returned vector.
   1041 /// \returns A 256-bit vector of [4 x double] containing the copied values.
   1042 #define _mm256_permute_pd(A, C) __extension__ ({ /* Result elements 0 and 1
          each pick A[0] or A[1] (bits 0 and 1 of C); result elements 2 and 3
          each pick A[2] or A[3] (bits 2 and 3 of C). Every index is below 4,
          so the undefined second vector is never selected. */ \
   1043   (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
   1044                                    (__v4df)_mm256_undefined_pd(), \
   1045                                    0 + (((C) >> 0) & 0x1), \
   1046                                    0 + (((C) >> 1) & 0x1), \
   1047                                    2 + (((C) >> 2) & 0x1), \
   1048                                    2 + (((C) >> 3) & 0x1)); })
   1049 
   1050 /// \brief Copies the values in a 128-bit vector of [4 x float] as specified by
   1051 ///    the immediate integer operand.
   1052 ///
   1053 /// \headerfile <x86intrin.h>
   1054 ///
   1055 /// \code
   1056 /// __m128 _mm_permute_ps(__m128 A, const int C);
   1057 /// \endcode
   1058 ///
   1059 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
   1060 ///
   1061 /// \param A
   1062 ///    A 128-bit vector of [4 x float].
   1063 /// \param C
   1064 ///    An immediate integer operand specifying how the values are to be
   1065 ///    copied. \n
   1066 ///    Bits [1:0]: \n
   1067 ///      00: Bits [31:0] of the source are copied to bits [31:0] of the
   1068 ///          returned vector. \n
   1069 ///      01: Bits [63:32] of the source are copied to bits [31:0] of the
   1070 ///          returned vector. \n
   1071 ///      10: Bits [95:64] of the source are copied to bits [31:0] of the
   1072 ///          returned vector. \n
   1073 ///      11: Bits [127:96] of the source are copied to bits [31:0] of the
   1074 ///          returned vector. \n
   1075 ///    Bits [3:2]: \n
   1076 ///      00: Bits [31:0] of the source are copied to bits [63:32] of the
   1077 ///          returned vector. \n
   1078 ///      01: Bits [63:32] of the source are copied to bits [63:32] of the
   1079 ///          returned vector. \n
   1080 ///      10: Bits [95:64] of the source are copied to bits [63:32] of the
   1081 ///          returned vector. \n
   1082 ///      11: Bits [127:96] of the source are copied to bits [63:32] of the
   1083 ///          returned vector. \n
   1084 ///    Bits [5:4]: \n
   1085 ///      00: Bits [31:0] of the source are copied to bits [95:64] of the
   1086 ///          returned vector. \n
   1087 ///      01: Bits [63:32] of the source are copied to bits [95:64] of the
   1088 ///          returned vector. \n
   1089 ///      10: Bits [95:64] of the source are copied to bits [95:64] of the
   1090 ///          returned vector. \n
   1091 ///      11: Bits [127:96] of the source are copied to bits [95:64] of the
   1092 ///          returned vector. \n
   1093 ///    Bits [7:6]: \n
   1094 ///      00: Bits [31:0] of the source are copied to bits [127:96] of the
   1095 ///          returned vector. \n
   1096 ///      01: Bits [63:32] of the source are copied to bits [127:96] of the
   1097 ///          returned vector. \n
   1098 ///      10: Bits [95:64] of the source are copied to bits [127:96] of the
   1099 ///          returned vector. \n
   1100 ///      11: Bits [127:96] of the source are copied to bits [127:96] of the
   1101 ///          returned vector.
   1102 /// \returns A 128-bit vector of [4 x float] containing the copied values.
   1103 #define _mm_permute_ps(A, C) __extension__ ({ /* Each result element uses
          one 2-bit field of C as its index (0..3) into A. Every index is below
          4, so the undefined second vector is never selected. */ \
   1104   (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
   1105                                   (__v4sf)_mm_undefined_ps(), \
   1106                                   ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
   1107                                   ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
   1108 
   1109 /// \brief Copies the values in a 256-bit vector of [8 x float] as specified by
   1110 ///    the immediate integer operand.
   1111 ///
   1112 /// \headerfile <x86intrin.h>
   1113 ///
   1114 /// \code
   1115 /// __m256 _mm256_permute_ps(__m256 A, const int C);
   1116 /// \endcode
   1117 ///
   1118 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
   1119 ///
   1120 /// \param A
   1121 ///    A 256-bit vector of [8 x float].
   1122 /// \param C
   1123 ///    An immediate integer operand specifying how the values are to be
   1124 ///    copied. \n
   1125 ///    Bits [1:0]: \n
   1126 ///      00: Bits [31:0] of the source are copied to bits [31:0] of the
   1127 ///          returned vector. \n
   1128 ///      01: Bits [63:32] of the source are copied to bits [31:0] of the
   1129 ///          returned vector. \n
   1130 ///      10: Bits [95:64] of the source are copied to bits [31:0] of the
   1131 ///          returned vector. \n
   1132 ///      11: Bits [127:96] of the source are copied to bits [31:0] of the
   1133 ///          returned vector. \n
   1134 ///    Bits [3:2]: \n
   1135 ///      00: Bits [31:0] of the source are copied to bits [63:32] of the
   1136 ///          returned vector. \n
   1137 ///      01: Bits [63:32] of the source are copied to bits [63:32] of the
   1138 ///          returned vector. \n
   1139 ///      10: Bits [95:64] of the source are copied to bits [63:32] of the
   1140 ///          returned vector. \n
   1141 ///      11: Bits [127:96] of the source are copied to bits [63:32] of the
   1142 ///          returned vector. \n
   1143 ///    Bits [5:4]: \n
   1144 ///      00: Bits [31:0] of the source are copied to bits [95:64] of the
   1145 ///          returned vector. \n
   1146 ///      01: Bits [63:32] of the source are copied to bits [95:64] of the
   1147 ///          returned vector. \n
   1148 ///      10: Bits [95:64] of the source are copied to bits [95:64] of the
   1149 ///          returned vector. \n
   1150 ///      11: Bits [127:96] of the source are copied to bits [95:64] of the
   1151 ///          returned vector. \n
   1152 ///    Bits [7:6]: \n
   1153 ///      00: Bits [31:0] of the source are copied to bits [127:96] of the
   1154 ///          returned vector. \n
   1155 ///      01: Bits [63:32] of the source are copied to bits [127:96] of the
   1156 ///          returned vector. \n
   1157 ///      10: Bits [95:64] of the source are copied to bits [127:96] of the
   1158 ///          returned vector. \n
   1159 ///      11: Bits [127:96] of the source are copied to bits [127:96] of the
   1160 ///          returned vector. \n
   1161 ///    Bits [1:0] (upper 128-bit half): \n
   1162 ///      00: Bits [159:128] of the source are copied to bits [159:128] of the
   1163 ///          returned vector. \n
   1164 ///      01: Bits [191:160] of the source are copied to bits [159:128] of the
   1165 ///          returned vector. \n
   1166 ///      10: Bits [223:192] of the source are copied to bits [159:128] of the
   1167 ///          returned vector. \n
   1168 ///      11: Bits [255:224] of the source are copied to bits [159:128] of the
   1169 ///          returned vector. \n
   1170 ///    Bits [3:2] (upper 128-bit half): \n
   1171 ///      00: Bits [159:128] of the source are copied to bits [191:160] of the
   1172 ///          returned vector. \n
   1173 ///      01: Bits [191:160] of the source are copied to bits [191:160] of the
   1174 ///          returned vector. \n
   1175 ///      10: Bits [223:192] of the source are copied to bits [191:160] of the
   1176 ///          returned vector. \n
   1177 ///      11: Bits [255:224] of the source are copied to bits [191:160] of the
   1178 ///          returned vector. \n
   1179 ///    Bits [5:4] (upper 128-bit half): \n
   1180 ///      00: Bits [159:128] of the source are copied to bits [223:192] of the
   1181 ///          returned vector. \n
   1182 ///      01: Bits [191:160] of the source are copied to bits [223:192] of the
   1183 ///          returned vector. \n
   1184 ///      10: Bits [223:192] of the source are copied to bits [223:192] of the
   1185 ///          returned vector. \n
   1186 ///      11: Bits [255:224] of the source are copied to bits [223:192] of the
   1187 ///          returned vector. \n
   1188 ///    Bits [7:6] (upper 128-bit half): \n
   1189 ///      00: Bits [159:128] of the source are copied to bits [255:224] of the
   1190 ///          returned vector. \n
   1191 ///      01: Bits [191:160] of the source are copied to bits [255:224] of the
   1192 ///          returned vector. \n
   1193 ///      10: Bits [223:192] of the source are copied to bits [255:224] of the
   1194 ///          returned vector. \n
   1195 ///      11: Bits [255:224] of the source are copied to bits [255:224] of the
   1196 ///          returned vector.
   1197 /// \returns A 256-bit vector of [8 x float] containing the copied values.
   1198 #define _mm256_permute_ps(A, C) __extension__ ({ /* The four 2-bit fields
          of C are used twice: unchanged for result elements 0-3 (indices 0..3)
          and offset by 4 for result elements 4-7 (indices 4..7). Each 128-bit
          half of A is therefore permuted within itself by the same pattern.
          Every index is below 8, so the undefined second vector is never
          selected. */ \
   1199   (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
   1200                                   (__v8sf)_mm256_undefined_ps(), \
   1201                                   0 + (((C) >> 0) & 0x3), \
   1202                                   0 + (((C) >> 2) & 0x3), \
   1203                                   0 + (((C) >> 4) & 0x3), \
   1204                                   0 + (((C) >> 6) & 0x3), \
   1205                                   4 + (((C) >> 0) & 0x3), \
   1206                                   4 + (((C) >> 2) & 0x3), \
   1207                                   4 + (((C) >> 4) & 0x3), \
   1208                                   4 + (((C) >> 6) & 0x3)); })
   1209 
   1210 /// \brief Permutes 128-bit data values stored in two 256-bit vectors of
   1211 ///    [4 x double], as specified by the immediate integer operand.
   1212 ///
   1213 /// \headerfile <x86intrin.h>
   1214 ///
   1215 /// \code
   1216 /// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
   1217 /// \endcode
   1218 ///
   1219 /// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
   1220 ///
   1221 /// \param V1
   1222 ///    A 256-bit vector of [4 x double].
   1223 /// \param V2
   1224 ///    A 256-bit vector of [4 x double].
   1225 /// \param M
   1226 ///    An immediate integer operand specifying how the values are to be
   1227 ///    permuted. \n
   1228 ///    Bits [1:0]: \n
   1229 ///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
   1230 ///          destination. \n
   1231 ///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
   1232 ///          destination. \n
   1233 ///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
   1234 ///          destination. \n
   1235 ///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
   1236 ///          destination. \n
   1237 ///    Bits [5:4]: \n
   1238 ///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
   1239 ///          destination. \n
   1240 ///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
   1241 ///          destination. \n
   1242 ///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
   1243 ///          destination. \n
   1244 ///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
   1245 ///          destination.
   1246 /// \returns A 256-bit vector of [4 x double] containing the copied values.
   1247 #define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ /* V1 and V2 are
          cast to __m256d and then to __v4df before the call; M is forwarded to
          the builtin unchanged as its control operand. */ \
   1248   (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
   1249                                            (__v4df)(__m256d)(V2), (M)); })
   1250 
   1251 /// \brief Permutes 128-bit data values stored in two 256-bit vectors of
   1252 ///    [8 x float], as specified by the immediate integer operand.
   1253 ///
   1254 /// \headerfile <x86intrin.h>
   1255 ///
   1256 /// \code
   1257 /// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
   1258 /// \endcode
   1259 ///
   1260 /// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
   1261 ///
   1262 /// \param V1
   1263 ///    A 256-bit vector of [8 x float].
   1264 /// \param V2
   1265 ///    A 256-bit vector of [8 x float].
   1266 /// \param M
   1267 ///    An immediate integer operand specifying how the values are to be
   1268 ///    permuted. \n
   1269 ///    Bits [1:0]: \n
   1270 ///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
   1271 ///    destination. \n
   1272 ///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
   1273 ///    destination. \n
   1274 ///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
   1275 ///    destination. \n
   1276 ///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
   1277 ///    destination. \n
   1278 ///    Bits [5:4]: \n
   1279 ///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
   1280 ///    destination. \n
   1281 ///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
   1282 ///    destination. \n
   1283 ///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
   1284 ///    destination. \n
   1285 ///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
   1286 ///    destination.
   1287 /// \returns A 256-bit vector of [8 x float] containing the copied values.
   1288 #define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ /* V1 and V2 are
          cast to __m256 and then to __v8sf before the call; M is forwarded to
          the builtin unchanged as its control operand. */ \
   1289   (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
   1290                                           (__v8sf)(__m256)(V2), (M)); })
   1291 
   1292 /// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
   1293 ///    as specified by the immediate integer operand.
   1294 ///
   1295 /// \headerfile <x86intrin.h>
   1296 ///
   1297 /// \code
   1298 /// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
   1299 /// \endcode
   1300 ///
   1301 /// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
   1302 ///
   1303 /// \param V1
   1304 ///    A 256-bit integer vector.
   1305 /// \param V2
   1306 ///    A 256-bit integer vector.
   1307 /// \param M
   1308 ///    An immediate integer operand specifying how the values are to be copied. \n
   1309 ///    Bits [1:0]: \n
   1310 ///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
   1311 ///    destination. \n
   1312 ///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
   1313 ///    destination. \n
   1314 ///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
   1315 ///    destination. \n
   1316 ///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
   1317 ///    destination. \n
   1318 ///    Bits [5:4]: \n
   1319 ///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
   1320 ///    destination. \n
   1321 ///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
   1322 ///    destination. \n
   1323 ///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
   1324 ///    destination. \n
   1325 ///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
   1326 ///    destination.
   1327 /// \returns A 256-bit integer vector containing the copied values.
   1328 #define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ /* The integer
          vectors are cast to __m256i and then to __v8si (the [8 x int] typedef
          from the top of this header) for the builtin; M is forwarded
          unchanged as its control operand. */ \
   1329   (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
   1330                                            (__v8si)(__m256i)(V2), (M)); })
   1331 
   1332 /* Vector Blend */
   1333 /// \brief Merges 64-bit double-precision data values stored in either of the
   1334 ///    two 256-bit vectors of [4 x double], as specified by the immediate
   1335 ///    integer operand.
   1336 ///
   1337 /// \headerfile <x86intrin.h>
   1338 ///
   1339 /// \code
   1340 /// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
   1341 /// \endcode
   1342 ///
   1343 /// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
   1344 ///
   1345 /// \param V1
   1346 ///    A 256-bit vector of [4 x double].
   1347 /// \param V2
   1348 ///    A 256-bit vector of [4 x double].
   1349 /// \param M
   1350 ///    An immediate integer operand, with mask bits [3:0] specifying how the
   1351 ///    values are to be copied. The position of the mask bit corresponds to the
   1352 ///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
   1353 ///    element in operand \a V1 is copied to the same position in the
   1354 ///    destination. When a mask bit is 1, the corresponding 64-bit element in
   1355 ///    operand \a V2 is copied to the same position in the destination.
        ///    Only bits [3:0] of \a M are examined; higher bits have no effect on
        ///    the element selection.
   1356 /// \returns A 256-bit vector of [4 x double] containing the copied values.
   1357 #define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
   1358   (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
   1359                                    (__v4df)(__m256d)(V2), \
   1360                                    (((M) & 0x01) ? 4 : 0), \
   1361                                    (((M) & 0x02) ? 5 : 1), \
   1362                                    (((M) & 0x04) ? 6 : 2), \
   1363                                    (((M) & 0x08) ? 7 : 3)); })
   1364 
   1365 /// \brief Merges 32-bit single-precision data values stored in either of the
   1366 ///    two 256-bit vectors of [8 x float], as specified by the immediate
   1367 ///    integer operand.
   1368 ///
   1369 /// \headerfile <x86intrin.h>
   1370 ///
   1371 /// \code
   1372 /// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
   1373 /// \endcode
   1374 ///
   1375 /// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
   1376 ///
   1377 /// \param V1
   1378 ///    A 256-bit vector of [8 x float].
   1379 /// \param V2
   1380 ///    A 256-bit vector of [8 x float].
   1381 /// \param M
   1382 ///    An immediate integer operand, with mask bits [7:0] specifying how the
   1383 ///    values are to be copied. The position of the mask bit corresponds to the
   1384 ///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
   1385 ///    element in operand \a V1 is copied to the same position in the
   1386 ///    destination. When a mask bit is 1, the corresponding 32-bit element in
   1387 ///    operand \a V2 is copied to the same position in the destination.
        ///    Only bits [7:0] of \a M are examined; higher bits have no effect on
        ///    the element selection.
   1388 /// \returns A 256-bit vector of [8 x float] containing the copied values.
   1389 #define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
   1390   (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
   1391                                   (__v8sf)(__m256)(V2), \
   1392                                   (((M) & 0x01) ?  8 : 0), \
   1393                                   (((M) & 0x02) ?  9 : 1), \
   1394                                   (((M) & 0x04) ? 10 : 2), \
   1395                                   (((M) & 0x08) ? 11 : 3), \
   1396                                   (((M) & 0x10) ? 12 : 4), \
   1397                                   (((M) & 0x20) ? 13 : 5), \
   1398                                   (((M) & 0x40) ? 14 : 6), \
   1399                                   (((M) & 0x80) ? 15 : 7)); })
   1400 
   1401 /// \brief Merges 64-bit double-precision data values stored in either of the
   1402 ///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
   1403 ///    operand.
   1404 ///
   1405 /// \headerfile <x86intrin.h>
   1406 ///
   1407 /// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
   1408 ///
   1409 /// \param __a
   1410 ///    A 256-bit vector of [4 x double].
   1411 /// \param __b
   1412 ///    A 256-bit vector of [4 x double].
   1413 /// \param __c
   1414 ///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
   1415 ///    how the values are to be copied. The position of the mask bit corresponds
   1416 ///    to the most significant bit of a copied value. When a mask bit is 0, the
   1417 ///    corresponding 64-bit element in operand \a __a is copied to the same
   1418 ///    position in the destination. When a mask bit is 1, the corresponding
   1419 ///    64-bit element in operand \a __b is copied to the same position in the
   1420 ///    destination.
   1421 /// \returns A 256-bit vector of [4 x double] containing the copied values.
   1422 static __inline __m256d __DEFAULT_FN_ATTRS
   1423 _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
   1424 {
          /* The two sources and the mask are all handed to the builtin as
           * [4 x double]; the result is converted back to __m256d. */
   1425   return (__m256d)__builtin_ia32_blendvpd256(
   1426     (__v4df)__a, (__v4df)__b, (__v4df)__c);
   1427 }
   1428 
   1429 /// \brief Merges 32-bit single-precision data values stored in either of the
   1430 ///    two 256-bit vectors of [8 x float], as specified by the 256-bit vector
   1431 ///    operand.
   1432 ///
   1433 /// \headerfile <x86intrin.h>
   1434 ///
   1435 /// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
   1436 ///
   1437 /// \param __a
   1438 ///    A 256-bit vector of [8 x float].
   1439 /// \param __b
   1440 ///    A 256-bit vector of [8 x float].
   1441 /// \param __c
   1442 ///    A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
   1443 ///    and 31 specifying how the values are to be copied. The position of the
   1444 ///    mask bit corresponds to the most significant bit of a copied value. When
   1445 ///    a mask bit is 0, the corresponding 32-bit element in operand \a __a is
   1446 ///    copied to the same position in the destination. When a mask bit is 1, the
   1447 ///    corresponding 32-bit element in operand \a __b is copied to the same
   1448 ///    position in the destination.
   1449 /// \returns A 256-bit vector of [8 x float] containing the copied values.
   1450 static __inline __m256 __DEFAULT_FN_ATTRS
   1451 _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
   1452 {
          /* The two sources and the mask are all handed to the builtin as
           * [8 x float]; the result is converted back to __m256. */
   1453   return (__m256)__builtin_ia32_blendvps256(
   1454     (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
   1455 }
   1456 
   1457 /* Vector Dot Product */
   1458 /// \brief Computes two dot products in parallel, using the lower and upper
   1459 ///    halves of two [8 x float] vectors as input to the two computations, and
   1460 ///    returning the two dot products in the lower and upper halves of the
   1461 ///    [8 x float] result.
   1462 ///
   1463 ///    The immediate integer operand controls which input elements will
   1464 ///    contribute to the dot product, and where the final results are returned.
   1465 ///    In general, for each dot product, the four corresponding elements of the
   1466 ///    input vectors are multiplied; the first two and second two products are
   1467 ///    summed, then the two sums are added to form the final result.
   1468 ///
   1469 /// \headerfile <x86intrin.h>
   1470 ///
   1471 /// \code
   1472 /// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
   1473 /// \endcode
   1474 ///
   1475 /// This intrinsic corresponds to the <c> VDPPS </c> instruction.
   1476 ///
   1477 /// \param V1
   1478 ///    A vector of [8 x float] values, treated as two [4 x float] vectors.
   1479 /// \param V2
   1480 ///    A vector of [8 x float] values, treated as two [4 x float] vectors.
   1481 /// \param M
   1482 ///    An immediate integer argument. Bits [7:4] determine which elements of
   1483 ///    the input vectors are used, with bit [4] corresponding to the lowest
   1484 ///    element and bit [7] corresponding to the highest element of each [4 x
   1485 ///    float] subvector. If a bit is set, the corresponding elements from the
   1486 ///    two input vectors are used as an input for dot product; otherwise that
   1487 ///    input is treated as zero. Bits [3:0] determine which elements of the
   1488 ///    result will receive a copy of the final dot product, with bit [0]
   1489 ///    corresponding to the lowest element and bit [3] corresponding to the
   1490 ///    highest element of each [4 x float] subvector. If a bit is set, the dot
   1491 ///    product is returned in the corresponding element; otherwise that element
   1492 ///    is set to zero. The bitmask is applied in the same way to each of the
   1493 ///    two parallel dot product computations.
   1494 /// \returns A 256-bit vector of [8 x float] containing the two dot products,
        ///    each copied to or zeroed in the elements of its half as selected by
        ///    bits [3:0] of \a M.
   1495 #define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
   1496   (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
   1497                                  (__v8sf)(__m256)(V2), (M)); })
   1498 
   1499 /* Vector Shuffle */
   1500 /// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
   1501 ///    specified by the immediate value operand.
   1502 ///
   1503 ///    The four selected elements in each operand are copied to the destination
   1504 ///    according to the bits specified in the immediate operand. The selected
   1505 ///    elements from the first 256-bit operand are copied to bits [63:0] and
   1506 ///    bits [191:128] of the destination, and the selected elements from the
   1507 ///    second 256-bit operand are copied to bits [127:64] and bits [255:192] of
   1508 ///    the destination. For example, if bits [7:0] of the immediate operand
   1509 ///    contain a value of 0xFF, the 256-bit destination vector would contain the
   1510 ///    following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
        ///    These are listed from the highest element, bits [255:224], down to the
        ///    lowest element, bits [31:0].
   1511 ///
   1512 /// \headerfile <x86intrin.h>
   1513 ///
   1514 /// \code
   1515 /// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
   1516 /// \endcode
   1517 ///
   1518 /// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
   1519 ///
   1520 /// \param a
   1521 ///    A 256-bit vector of [8 x float]. The four selected elements in this
   1522 ///    operand are copied to bits [63:0] and bits [191:128] in the destination,
   1523 ///    according to the bits specified in the immediate operand.
   1524 /// \param b
   1525 ///    A 256-bit vector of [8 x float]. The four selected elements in this
   1526 ///    operand are copied to bits [127:64] and bits [255:192] in the
   1527 ///    destination, according to the bits specified in the immediate operand.
   1528 /// \param mask
   1529 ///    An immediate value containing an 8-bit value specifying which elements to
   1530 ///    copy from \a a and \a b. \n
   1531 ///    Bits [3:0] specify the values copied from operand \a a. \n
   1532 ///    Bits [7:4] specify the values copied from operand \a b. \n
   1533 ///    The destinations within the 256-bit destination are assigned values as
   1534 ///    follows, according to the bit value assignments described below: \n
   1535 ///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
   1536 ///    destination. \n
   1537 ///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
   1538 ///    destination. \n
   1539 ///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
   1540 ///    destination. \n
   1541 ///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
   1542 ///    the destination. \n
   1543 ///    Bit value assignments: \n
   1544 ///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
   1545 ///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
   1546 ///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
   1547 ///    11: Bits [127:96] and [255:224] are copied from the selected operand.
   1548 /// \returns A 256-bit vector of [8 x float] containing the shuffled values.
   1549 #define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
   1550   (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
   1551                                   (__v8sf)(__m256)(b), \
   1552                                   0  + (((mask) >> 0) & 0x3), \
   1553                                   0  + (((mask) >> 2) & 0x3), \
   1554                                   8  + (((mask) >> 4) & 0x3), \
   1555                                   8  + (((mask) >> 6) & 0x3), \
   1556                                   4  + (((mask) >> 0) & 0x3), \
   1557                                   4  + (((mask) >> 2) & 0x3), \
   1558                                   12 + (((mask) >> 4) & 0x3), \
   1559                                   12 + (((mask) >> 6) & 0x3)); })
   1560 
   1561 /// \brief Selects four double-precision values from the 256-bit operands of
   1562 ///    [4 x double], as specified by the immediate value operand.
   1563 ///
   1564 ///    The selected elements from the first 256-bit operand are copied to bits
   1565 ///    [63:0] and bits [191:128] in the destination, and the selected elements
   1566 ///    from the second 256-bit operand are copied to bits [127:64] and bits
   1567 ///    [255:192] in the destination. For example, if bits [3:0] of the immediate
   1568 ///    operand contain a value of 0xF, the 256-bit destination vector would
   1569 ///    contain the following values: b[3], a[3], b[1], a[1].
        ///    These are listed from the highest element, bits [255:192], down to the
        ///    lowest element, bits [63:0].
   1570 ///
   1571 /// \headerfile <x86intrin.h>
   1572 ///
   1573 /// \code
   1574 /// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
   1575 /// \endcode
   1576 ///
   1577 /// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
   1578 ///
   1579 /// \param a
   1580 ///    A 256-bit vector of [4 x double].
   1581 /// \param b
   1582 ///    A 256-bit vector of [4 x double].
   1583 /// \param mask
   1584 ///    An immediate value whose bits [3:0] specify which elements to
   1585 ///    copy from \a a and \a b: \n
   1586 ///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
   1587 ///    destination. \n
   1588 ///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
   1589 ///    destination. \n
   1590 ///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
   1591 ///    destination. \n
   1592 ///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
   1593 ///    destination. \n
   1594 ///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
   1595 ///    destination. \n
   1596 ///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
   1597 ///    destination. \n
   1598 ///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
   1599 ///    destination. \n
   1600 ///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
   1601 ///    destination.
   1602 /// \returns A 256-bit vector of [4 x double] containing the shuffled values.
   1603 #define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
   1604   (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
   1605                                    (__v4df)(__m256d)(b), \
   1606                                    0 + (((mask) >> 0) & 0x1), \
   1607                                    4 + (((mask) >> 1) & 0x1), \
   1608                                    2 + (((mask) >> 2) & 0x1), \
   1609                                    6 + (((mask) >> 3) & 0x1)); })
   1610 
   1611 /* Compare */
        /* Comparison predicate values, numbered consecutively from 0x00 to 0x1f.
         * In a two-letter name suffix the first letter is O (ordered) or
         * U (unordered) and the second is S (signaling) or Q (non-signaling).
         * The ORD/UNORD predicates carry only the trailing S or Q. The comment
         * beside each value spells the meaning out in full. */
   1612 #define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
   1613 #define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
   1614 #define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
   1615 #define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
   1616 #define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
   1617 #define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
   1618 #define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
   1619 #define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)  */
   1620 #define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
   1621 #define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
   1622 #define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
   1623 #define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
   1624 #define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
   1625 #define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
   1626 #define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
   1627 #define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
   1628 #define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
   1629 #define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
   1630 #define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
   1631 #define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
   1632 #define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
   1633 #define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
   1634 #define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
   1635 #define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
   1636 #define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
   1637 #define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
   1638 #define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
   1639 #define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
   1640 #define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
   1641 #define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
   1642 #define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
   1643 #define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
   1644 
   1645 /// \brief Compares each of the corresponding double-precision values of two
   1646 ///    128-bit vectors of [2 x double], using the operation specified by the
   1647 ///    immediate integer operand.
   1648 ///
   1649 ///    Returns a [2 x double] vector consisting of two doubles corresponding to
   1650 ///    the two comparison results: zero if the comparison is false, and all 1's
   1651 ///    if the comparison is true.
   1652 ///
   1653 /// \headerfile <x86intrin.h>
   1654 ///
   1655 /// \code
   1656 /// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
   1657 /// \endcode
   1658 ///
   1659 /// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
   1660 ///
   1661 /// \param a
   1662 ///    A 128-bit vector of [2 x double].
   1663 /// \param b
   1664 ///    A 128-bit vector of [2 x double].
   1665 /// \param c
   1666 ///    An immediate integer operand, with bits [4:0] specifying which comparison
   1667 ///    operation to use (the _CMP_* macros name these values): \n
   1668 ///    0x00 : Equal (ordered, non-signaling) \n
   1669 ///    0x01 : Less-than (ordered, signaling) \n
   1670 ///    0x02 : Less-than-or-equal (ordered, signaling) \n
   1671 ///    0x03 : Unordered (non-signaling) \n
   1672 ///    0x04 : Not-equal (unordered, non-signaling) \n
   1673 ///    0x05 : Not-less-than (unordered, signaling) \n
   1674 ///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
   1675 ///    0x07 : Ordered (non-signaling) \n
   1676 ///    0x08 : Equal (unordered, non-signaling) \n
   1677 ///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
   1678 ///    0x0a : Not-greater-than (unordered, signaling) \n
   1679 ///    0x0b : False (ordered, non-signaling) \n
   1680 ///    0x0c : Not-equal (ordered, non-signaling) \n
   1681 ///    0x0d : Greater-than-or-equal (ordered, signaling) \n
   1682 ///    0x0e : Greater-than (ordered, signaling) \n
   1683 ///    0x0f : True (unordered, non-signaling) \n
   1684 ///    0x10 : Equal (ordered, signaling) \n
   1685 ///    0x11 : Less-than (ordered, non-signaling) \n
   1686 ///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
   1687 ///    0x13 : Unordered (signaling) \n
   1688 ///    0x14 : Not-equal (unordered, signaling) \n
   1689 ///    0x15 : Not-less-than (unordered, non-signaling) \n
   1690 ///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
   1691 ///    0x17 : Ordered (signaling) \n
   1692 ///    0x18 : Equal (unordered, signaling) \n
   1693 ///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
   1694 ///    0x1a : Not-greater-than (unordered, non-signaling) \n
   1695 ///    0x1b : False (ordered, signaling) \n
   1696 ///    0x1c : Not-equal (ordered, signaling) \n
   1697 ///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
   1698 ///    0x1e : Greater-than (ordered, non-signaling) \n
   1699 ///    0x1f : True (unordered, signaling)
   1700 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
   1701 #define _mm_cmp_pd(a, b, c) __extension__ ({ \
   1702   (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
   1703                                 (__v2df)(__m128d)(b), (c)); })
   1704 
   1705 /// \brief Compares each of the corresponding values of two 128-bit vectors of
   1706 ///    [4 x float], using the operation specified by the immediate integer
   1707 ///    operand.
   1708 ///
   1709 ///    Returns a [4 x float] vector consisting of four floats corresponding to
   1710 ///    the four comparison results: zero if the comparison is false, and all 1's
   1711 ///    if the comparison is true.
   1712 ///
   1713 /// \headerfile <x86intrin.h>
   1714 ///
   1715 /// \code
   1716 /// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
   1717 /// \endcode
   1718 ///
   1719 /// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
   1720 ///
   1721 /// \param a
   1722 ///    A 128-bit vector of [4 x float].
   1723 /// \param b
   1724 ///    A 128-bit vector of [4 x float].
   1725 /// \param c
   1726 ///    An immediate integer operand, with bits [4:0] specifying which comparison
   1727 ///    operation to use (the _CMP_* macros name these values): \n
   1728 ///    0x00 : Equal (ordered, non-signaling) \n
   1729 ///    0x01 : Less-than (ordered, signaling) \n
   1730 ///    0x02 : Less-than-or-equal (ordered, signaling) \n
   1731 ///    0x03 : Unordered (non-signaling) \n
   1732 ///    0x04 : Not-equal (unordered, non-signaling) \n
   1733 ///    0x05 : Not-less-than (unordered, signaling) \n
   1734 ///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
   1735 ///    0x07 : Ordered (non-signaling) \n
   1736 ///    0x08 : Equal (unordered, non-signaling) \n
   1737 ///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
   1738 ///    0x0a : Not-greater-than (unordered, signaling) \n
   1739 ///    0x0b : False (ordered, non-signaling) \n
   1740 ///    0x0c : Not-equal (ordered, non-signaling) \n
   1741 ///    0x0d : Greater-than-or-equal (ordered, signaling) \n
   1742 ///    0x0e : Greater-than (ordered, signaling) \n
   1743 ///    0x0f : True (unordered, non-signaling) \n
   1744 ///    0x10 : Equal (ordered, signaling) \n
   1745 ///    0x11 : Less-than (ordered, non-signaling) \n
   1746 ///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
   1747 ///    0x13 : Unordered (signaling) \n
   1748 ///    0x14 : Not-equal (unordered, signaling) \n
   1749 ///    0x15 : Not-less-than (unordered, non-signaling) \n
   1750 ///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
   1751 ///    0x17 : Ordered (signaling) \n
   1752 ///    0x18 : Equal (unordered, signaling) \n
   1753 ///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
   1754 ///    0x1a : Not-greater-than (unordered, non-signaling) \n
   1755 ///    0x1b : False (ordered, signaling) \n
   1756 ///    0x1c : Not-equal (ordered, signaling) \n
   1757 ///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
   1758 ///    0x1e : Greater-than (ordered, non-signaling) \n
   1759 ///    0x1f : True (unordered, signaling)
   1760 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
   1761 #define _mm_cmp_ps(a, b, c) __extension__ ({ \
   1762   (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
   1763                                (__v4sf)(__m128)(b), (c)); })
   1764 
   1765 /// \brief Compares each of the corresponding double-precision values of two
   1766 ///    256-bit vectors of [4 x double], using the operation specified by the
   1767 ///    immediate integer operand.
   1768 ///
   1769 ///    Returns a [4 x double] vector consisting of four doubles corresponding to
   1770 ///    the four comparison results: zero if the comparison is false, and all 1's
   1771 ///    if the comparison is true.
   1772 ///
   1773 /// \headerfile <x86intrin.h>
   1774 ///
   1775 /// \code
   1776 /// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
   1777 /// \endcode
   1778 ///
   1779 /// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
   1780 ///
   1781 /// \param a
   1782 ///    A 256-bit vector of [4 x double].
   1783 /// \param b
   1784 ///    A 256-bit vector of [4 x double].
   1785 /// \param c
   1786 ///    An immediate integer operand, with bits [4:0] specifying which comparison
   1787 ///    operation to use (the _CMP_* macros name these values): \n
   1788 ///    0x00 : Equal (ordered, non-signaling) \n
   1789 ///    0x01 : Less-than (ordered, signaling) \n
   1790 ///    0x02 : Less-than-or-equal (ordered, signaling) \n
   1791 ///    0x03 : Unordered (non-signaling) \n
   1792 ///    0x04 : Not-equal (unordered, non-signaling) \n
   1793 ///    0x05 : Not-less-than (unordered, signaling) \n
   1794 ///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
   1795 ///    0x07 : Ordered (non-signaling) \n
   1796 ///    0x08 : Equal (unordered, non-signaling) \n
   1797 ///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
   1798 ///    0x0a : Not-greater-than (unordered, signaling) \n
   1799 ///    0x0b : False (ordered, non-signaling) \n
   1800 ///    0x0c : Not-equal (ordered, non-signaling) \n
   1801 ///    0x0d : Greater-than-or-equal (ordered, signaling) \n
   1802 ///    0x0e : Greater-than (ordered, signaling) \n
   1803 ///    0x0f : True (unordered, non-signaling) \n
   1804 ///    0x10 : Equal (ordered, signaling) \n
   1805 ///    0x11 : Less-than (ordered, non-signaling) \n
   1806 ///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
   1807 ///    0x13 : Unordered (signaling) \n
   1808 ///    0x14 : Not-equal (unordered, signaling) \n
   1809 ///    0x15 : Not-less-than (unordered, non-signaling) \n
   1810 ///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
   1811 ///    0x17 : Ordered (signaling) \n
   1812 ///    0x18 : Equal (unordered, signaling) \n
   1813 ///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
   1814 ///    0x1a : Not-greater-than (unordered, non-signaling) \n
   1815 ///    0x1b : False (ordered, signaling) \n
   1816 ///    0x1c : Not-equal (ordered, signaling) \n
   1817 ///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
   1818 ///    0x1e : Greater-than (ordered, non-signaling) \n
   1819 ///    0x1f : True (unordered, signaling)
   1820 /// \returns A 256-bit vector of [4 x double] containing the comparison results.
   1821 #define _mm256_cmp_pd(a, b, c) __extension__ ({ \
   1822   (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
   1823                                    (__v4df)(__m256d)(b), (c)); })
   1824 
   1825 /// \brief Compares each of the corresponding values of two 256-bit vectors of
   1826 ///    [8 x float], using the operation specified by the immediate integer
   1827 ///    operand.
   1828 ///
   1829 ///    Returns a [8 x float] vector consisting of eight floats corresponding to
   1830 ///    the eight comparison results: zero if the comparison is false, and all
   1831 ///    1's if the comparison is true.
   1832 ///
   1833 /// \headerfile <x86intrin.h>
   1834 ///
   1835 /// \code
   1836 /// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
   1837 /// \endcode
   1838 ///
   1839 /// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
   1840 ///
   1841 /// \param a
   1842 ///    A 256-bit vector of [8 x float].
   1843 /// \param b
   1844 ///    A 256-bit vector of [8 x float].
   1845 /// \param c
   1846 ///    An immediate integer operand, with bits [4:0] specifying which comparison
   1847 ///    operation to use (the _CMP_* macros name these values): \n
   1848 ///    0x00 : Equal (ordered, non-signaling) \n
   1849 ///    0x01 : Less-than (ordered, signaling) \n
   1850 ///    0x02 : Less-than-or-equal (ordered, signaling) \n
   1851 ///    0x03 : Unordered (non-signaling) \n
   1852 ///    0x04 : Not-equal (unordered, non-signaling) \n
   1853 ///    0x05 : Not-less-than (unordered, signaling) \n
   1854 ///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
   1855 ///    0x07 : Ordered (non-signaling) \n
   1856 ///    0x08 : Equal (unordered, non-signaling) \n
   1857 ///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
   1858 ///    0x0a : Not-greater-than (unordered, signaling) \n
   1859 ///    0x0b : False (ordered, non-signaling) \n
   1860 ///    0x0c : Not-equal (ordered, non-signaling) \n
   1861 ///    0x0d : Greater-than-or-equal (ordered, signaling) \n
   1862 ///    0x0e : Greater-than (ordered, signaling) \n
   1863 ///    0x0f : True (unordered, non-signaling) \n
   1864 ///    0x10 : Equal (ordered, signaling) \n
   1865 ///    0x11 : Less-than (ordered, non-signaling) \n
   1866 ///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
   1867 ///    0x13 : Unordered (signaling) \n
   1868 ///    0x14 : Not-equal (unordered, signaling) \n
   1869 ///    0x15 : Not-less-than (unordered, non-signaling) \n
   1870 ///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
   1871 ///    0x17 : Ordered (signaling) \n
   1872 ///    0x18 : Equal (unordered, signaling) \n
   1873 ///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
   1874 ///    0x1a : Not-greater-than (unordered, non-signaling) \n
   1875 ///    0x1b : False (ordered, signaling) \n
   1876 ///    0x1c : Not-equal (ordered, signaling) \n
   1877 ///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
   1878 ///    0x1e : Greater-than (ordered, non-signaling) \n
   1879 ///    0x1f : True (unordered, signaling)
   1880 /// \returns A 256-bit vector of [8 x float] containing the comparison results.
   1881 #define _mm256_cmp_ps(a, b, c) __extension__ ({ \
   1882   (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
   1883                                   (__v8sf)(__m256)(b), (c)); })
   1884 
   1885 /// \brief Compares each of the corresponding scalar double-precision values of
   1886 ///    two 128-bit vectors of [2 x double], using the operation specified by the
   1887 ///    immediate integer operand.
   1888 ///
   1889 ///    If the result is true, all 64 bits of the destination vector are set;
   1890 ///    otherwise they are cleared.
   1891 ///
   1892 /// \headerfile <x86intrin.h>
   1893 ///
   1894 /// \code
   1895 /// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
   1896 /// \endcode
   1897 ///
   1898 /// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
   1899 ///
   1900 /// \param a
   1901 ///    A 128-bit vector of [2 x double].
   1902 /// \param b
   1903 ///    A 128-bit vector of [2 x double].
   1904 /// \param c
   1905 ///    An immediate integer operand, with bits [4:0] specifying which comparison
   1906 ///    operation to use (the _CMP_* macros name these values): \n
   1907 ///    0x00 : Equal (ordered, non-signaling) \n
   1908 ///    0x01 : Less-than (ordered, signaling) \n
   1909 ///    0x02 : Less-than-or-equal (ordered, signaling) \n
   1910 ///    0x03 : Unordered (non-signaling) \n
   1911 ///    0x04 : Not-equal (unordered, non-signaling) \n
   1912 ///    0x05 : Not-less-than (unordered, signaling) \n
   1913 ///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
   1914 ///    0x07 : Ordered (non-signaling) \n
   1915 ///    0x08 : Equal (unordered, non-signaling) \n
   1916 ///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
   1917 ///    0x0a : Not-greater-than (unordered, signaling) \n
   1918 ///    0x0b : False (ordered, non-signaling) \n
   1919 ///    0x0c : Not-equal (ordered, non-signaling) \n
   1920 ///    0x0d : Greater-than-or-equal (ordered, signaling) \n
   1921 ///    0x0e : Greater-than (ordered, signaling) \n
   1922 ///    0x0f : True (unordered, non-signaling) \n
   1923 ///    0x10 : Equal (ordered, signaling) \n
   1924 ///    0x11 : Less-than (ordered, non-signaling) \n
   1925 ///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
   1926 ///    0x13 : Unordered (signaling) \n
   1927 ///    0x14 : Not-equal (unordered, signaling) \n
   1928 ///    0x15 : Not-less-than (unordered, non-signaling) \n
   1929 ///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
   1930 ///    0x17 : Ordered (signaling) \n
   1931 ///    0x18 : Equal (unordered, signaling) \n
   1932 ///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
   1933 ///    0x1a : Not-greater-than (unordered, non-signaling) \n
   1934 ///    0x1b : False (ordered, signaling) \n
   1935 ///    0x1c : Not-equal (ordered, signaling) \n
   1936 ///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
   1937 ///    0x1e : Greater-than (ordered, non-signaling) \n
   1938 ///    0x1f : True (unordered, signaling)
   1939 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
   1940 #define _mm_cmp_sd(a, b, c) __extension__ ({ \
   1941   (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
   1942                                 (__v2df)(__m128d)(b), (c)); })
   1943 
   1944 /// \brief Compares each of the corresponding scalar values of two 128-bit
   1945 ///    vectors of [4 x float], using the operation specified by the immediate
   1946 ///    integer operand.
   1947 ///
   1948 ///    If the result is true, all 32 bits of the destination vector are set;
   1949 ///    otherwise they are cleared.
   1950 ///
   1951 /// \headerfile <x86intrin.h>
   1952 ///
   1953 /// \code
   1954 /// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
   1955 /// \endcode
   1956 ///
   1957 /// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
   1958 ///
   1959 /// \param a
   1960 ///    A 128-bit vector of [4 x float].
   1961 /// \param b
   1962 ///    A 128-bit vector of [4 x float].
   1963 /// \param c
   1964 ///    An immediate integer operand, with bits [4:0] specifying which comparison
   1965 ///    operation to use: \n
   1966 ///    0x00 : Equal (ordered, non-signaling)
   1967 ///    0x01 : Less-than (ordered, signaling)
   1968 ///    0x02 : Less-than-or-equal (ordered, signaling)
   1969 ///    0x03 : Unordered (non-signaling)
   1970 ///    0x04 : Not-equal (unordered, non-signaling)
   1971 ///    0x05 : Not-less-than (unordered, signaling)
   1972 ///    0x06 : Not-less-than-or-equal (unordered, signaling)
   1973 ///    0x07 : Ordered (non-signaling)
   1974 ///    0x08 : Equal (unordered, non-signaling)
   1975 ///    0x09 : Not-greater-than-or-equal (unordered, signaling)
   1976 ///    0x0a : Not-greater-than (unordered, signaling)
   1977 ///    0x0b : False (ordered, non-signaling)
   1978 ///    0x0c : Not-equal (ordered, non-signaling)
   1979 ///    0x0d : Greater-than-or-equal (ordered, signaling)
   1980 ///    0x0e : Greater-than (ordered, signaling)
   1981 ///    0x0f : True (unordered, non-signaling)
   1982 ///    0x10 : Equal (ordered, signaling)
   1983 ///    0x11 : Less-than (ordered, non-signaling)
   1984 ///    0x12 : Less-than-or-equal (ordered, non-signaling)
   1985 ///    0x13 : Unordered (signaling)
   1986 ///    0x14 : Not-equal (unordered, signaling)
   1987 ///    0x15 : Not-less-than (unordered, non-signaling)
   1988 ///    0x16 : Not-less-than-or-equal (unordered, non-signaling)
   1989 ///    0x17 : Ordered (signaling)
   1990 ///    0x18 : Equal (unordered, signaling)
   1991 ///    0x19 : Not-greater-than-or-equal (unordered, non-signaling)
   1992 ///    0x1a : Not-greater-than (unordered, non-signaling)
   1993 ///    0x1b : False (ordered, signaling)
   1994 ///    0x1c : Not-equal (ordered, signaling)
   1995 ///    0x1d : Greater-than-or-equal (ordered, non-signaling)
   1996 ///    0x1e : Greater-than (ordered, non-signaling)
   1997 ///    0x1f : True (unordered, signaling)
   1998 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
   1999 #define _mm_cmp_ss(a, b, c) __extension__ ({ \
   2000   (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
   2001                                (__v4sf)(__m128)(b), (c)); })
   2002 
   2003 /// \brief Takes a [8 x i32] vector and returns the vector element value
   2004 ///    indexed by the immediate constant operand.
   2005 ///
   2006 /// \headerfile <x86intrin.h>
   2007 ///
   2008 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
   2009 ///   instruction.
   2010 ///
   2011 /// \param __a
   2012 ///    A 256-bit vector of [8 x i32].
   2013 /// \param __imm
   2014 ///    An immediate integer operand with bits [2:0] determining which vector
   2015 ///    element is extracted and returned.
   2016 /// \returns A 32-bit integer containing the extracted 32 bits of extended
   2017 ///    packed data.
   2018 static __inline int __DEFAULT_FN_ATTRS
   2019 _mm256_extract_epi32(__m256i __a, const int __imm)
   2020 {
   2021   __v8si __b = (__v8si)__a;
   2022   return __b[__imm & 7];
   2023 }
   2024 
   2025 /// \brief Takes a [16 x i16] vector and returns the vector element value
   2026 ///    indexed by the immediate constant operand.
   2027 ///
   2028 /// \headerfile <x86intrin.h>
   2029 ///
   2030 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
   2031 ///   instruction.
   2032 ///
   2033 /// \param __a
   2034 ///    A 256-bit integer vector of [16 x i16].
   2035 /// \param __imm
   2036 ///    An immediate integer operand with bits [3:0] determining which vector
   2037 ///    element is extracted and returned.
   2038 /// \returns A 32-bit integer containing the extracted 16 bits of zero extended
   2039 ///    packed data.
   2040 static __inline int __DEFAULT_FN_ATTRS
   2041 _mm256_extract_epi16(__m256i __a, const int __imm)
   2042 {
   2043   __v16hi __b = (__v16hi)__a;
   2044   return (unsigned short)__b[__imm & 15];
   2045 }
   2046 
   2047 /// \brief Takes a [32 x i8] vector and returns the vector element value
   2048 ///    indexed by the immediate constant operand.
   2049 ///
   2050 /// \headerfile <x86intrin.h>
   2051 ///
   2052 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
   2053 ///   instruction.
   2054 ///
   2055 /// \param __a
   2056 ///    A 256-bit integer vector of [32 x i8].
   2057 /// \param __imm
   2058 ///    An immediate integer operand with bits [4:0] determining which vector
   2059 ///    element is extracted and returned.
   2060 /// \returns A 32-bit integer containing the extracted 8 bits of zero extended
   2061 ///    packed data.
   2062 static __inline int __DEFAULT_FN_ATTRS
   2063 _mm256_extract_epi8(__m256i __a, const int __imm)
   2064 {
   2065   __v32qi __b = (__v32qi)__a;
   2066   return (unsigned char)__b[__imm & 31];
   2067 }
   2068 
   2069 #ifdef __x86_64__
   2070 /// \brief Takes a [4 x i64] vector and returns the vector element value
   2071 ///    indexed by the immediate constant operand.
   2072 ///
   2073 /// \headerfile <x86intrin.h>
   2074 ///
   2075 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
   2076 ///   instruction.
   2077 ///
   2078 /// \param __a
   2079 ///    A 256-bit integer vector of [4 x i64].
   2080 /// \param __imm
   2081 ///    An immediate integer operand with bits [1:0] determining which vector
   2082 ///    element is extracted and returned.
   2083 /// \returns A 64-bit integer containing the extracted 64 bits of extended
   2084 ///    packed data.
   2085 static __inline long long  __DEFAULT_FN_ATTRS
   2086 _mm256_extract_epi64(__m256i __a, const int __imm)
   2087 {
   2088   __v4di __b = (__v4di)__a;
   2089   return __b[__imm & 3];
   2090 }
   2091 #endif
   2092 
   2093 /// \brief Takes a [8 x i32] vector and replaces the vector element value
   2094 ///    indexed by the immediate constant operand by a new value. Returns the
   2095 ///    modified vector.
   2096 ///
   2097 /// \headerfile <x86intrin.h>
   2098 ///
   2099 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
   2100 ///   instruction.
   2101 ///
   2102 /// \param __a
   2103 ///    A vector of [8 x i32] to be used by the insert operation.
   2104 /// \param __b
   2105 ///    An integer value. The replacement value for the insert operation.
   2106 /// \param __imm
   2107 ///    An immediate integer specifying the index of the vector element to be
   2108 ///    replaced.
   2109 /// \returns A copy of vector \a __a, after replacing its element indexed by
   2110 ///    \a __imm with \a __b.
   2111 static __inline __m256i __DEFAULT_FN_ATTRS
   2112 _mm256_insert_epi32(__m256i __a, int __b, int const __imm)
   2113 {
   2114   __v8si __c = (__v8si)__a;
   2115   __c[__imm & 7] = __b;
   2116   return (__m256i)__c;
   2117 }
   2118 
   2119 
   2120 /// \brief Takes a [16 x i16] vector and replaces the vector element value
   2121 ///    indexed by the immediate constant operand with a new value. Returns the
   2122 ///    modified vector.
   2123 ///
   2124 /// \headerfile <x86intrin.h>
   2125 ///
   2126 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
   2127 ///   instruction.
   2128 ///
   2129 /// \param __a
   2130 ///    A vector of [16 x i16] to be used by the insert operation.
   2131 /// \param __b
   2132 ///    An i16 integer value. The replacement value for the insert operation.
   2133 /// \param __imm
   2134 ///    An immediate integer specifying the index of the vector element to be
   2135 ///    replaced.
   2136 /// \returns A copy of vector \a __a, after replacing its element indexed by
   2137 ///    \a __imm with \a __b.
   2138 static __inline __m256i __DEFAULT_FN_ATTRS
   2139 _mm256_insert_epi16(__m256i __a, int __b, int const __imm)
   2140 {
   2141   __v16hi __c = (__v16hi)__a;
   2142   __c[__imm & 15] = __b;
   2143   return (__m256i)__c;
   2144 }
   2145 
   2146 /// \brief Takes a [32 x i8] vector and replaces the vector element value
   2147 ///    indexed by the immediate constant operand with a new value. Returns the
   2148 ///    modified vector.
   2149 ///
   2150 /// \headerfile <x86intrin.h>
   2151 ///
   2152 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
   2153 ///   instruction.
   2154 ///
   2155 /// \param __a
   2156 ///    A vector of [32 x i8] to be used by the insert operation.
   2157 /// \param __b
   2158 ///    An i8 integer value. The replacement value for the insert operation.
   2159 /// \param __imm
   2160 ///    An immediate integer specifying the index of the vector element to be
   2161 ///    replaced.
   2162 /// \returns A copy of vector \a __a, after replacing its element indexed by
   2163 ///    \a __imm with \a __b.
   2164 static __inline __m256i __DEFAULT_FN_ATTRS
   2165 _mm256_insert_epi8(__m256i __a, int __b, int const __imm)
   2166 {
   2167   __v32qi __c = (__v32qi)__a;
   2168   __c[__imm & 31] = __b;
   2169   return (__m256i)__c;
   2170 }
   2171 
   2172 #ifdef __x86_64__
   2173 /// \brief Takes a [4 x i64] vector and replaces the vector element value
   2174 ///    indexed by the immediate constant operand with a new value. Returns the
   2175 ///    modified vector.
   2176 ///
   2177 /// \headerfile <x86intrin.h>
   2178 ///
   2179 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
   2180 ///   instruction.
   2181 ///
   2182 /// \param __a
   2183 ///    A vector of [4 x i64] to be used by the insert operation.
   2184 /// \param __b
   2185 ///    A 64-bit integer value. The replacement value for the insert operation.
   2186 /// \param __imm
   2187 ///    An immediate integer specifying the index of the vector element to be
   2188 ///    replaced.
   2189 /// \returns A copy of vector \a __a, after replacing its element indexed by
   2190 ///     \a __imm with \a __b.
   2191 static __inline __m256i __DEFAULT_FN_ATTRS
   2192 _mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
   2193 {
   2194   __v4di __c = (__v4di)__a;
   2195   __c[__imm & 3] = __b;
   2196   return (__m256i)__c;
   2197 }
   2198 #endif
   2199 
   2200 /* Conversion */
   2201 /// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
   2202 ///
   2203 /// \headerfile <x86intrin.h>
   2204 ///
   2205 /// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
   2206 ///
   2207 /// \param __a
   2208 ///    A 128-bit integer vector of [4 x i32].
   2209 /// \returns A 256-bit vector of [4 x double] containing the converted values.
   2210 static __inline __m256d __DEFAULT_FN_ATTRS
   2211 _mm256_cvtepi32_pd(__m128i __a)
   2212 {
   2213   return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
   2214 }
   2215 
   2216 /// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
   2217 ///
   2218 /// \headerfile <x86intrin.h>
   2219 ///
   2220 /// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
   2221 ///
   2222 /// \param __a
   2223 ///    A 256-bit integer vector.
   2224 /// \returns A 256-bit vector of [8 x float] containing the converted values.
   2225 static __inline __m256 __DEFAULT_FN_ATTRS
   2226 _mm256_cvtepi32_ps(__m256i __a)
   2227 {
   2228   return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
   2229 }
   2230 
   2231 /// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
   2232 ///    [4 x float].
   2233 ///
   2234 /// \headerfile <x86intrin.h>
   2235 ///
   2236 /// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
   2237 ///
   2238 /// \param __a
   2239 ///    A 256-bit vector of [4 x double].
   2240 /// \returns A 128-bit vector of [4 x float] containing the converted values.
   2241 static __inline __m128 __DEFAULT_FN_ATTRS
   2242 _mm256_cvtpd_ps(__m256d __a)
   2243 {
   2244   return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
   2245 }
   2246 
   2247 /// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
   2248 ///
   2249 /// \headerfile <x86intrin.h>
   2250 ///
   2251 /// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
   2252 ///
   2253 /// \param __a
   2254 ///    A 256-bit vector of [8 x float].
   2255 /// \returns A 256-bit integer vector containing the converted values.
   2256 static __inline __m256i __DEFAULT_FN_ATTRS
   2257 _mm256_cvtps_epi32(__m256 __a)
   2258 {
   2259   return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
   2260 }
   2261 
   2262 /// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
   2263 ///    x double].
   2264 ///
   2265 /// \headerfile <x86intrin.h>
   2266 ///
   2267 /// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
   2268 ///
   2269 /// \param __a
   2270 ///    A 128-bit vector of [4 x float].
   2271 /// \returns A 256-bit vector of [4 x double] containing the converted values.
   2272 static __inline __m256d __DEFAULT_FN_ATTRS
   2273 _mm256_cvtps_pd(__m128 __a)
   2274 {
   2275   return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
   2276 }
   2277 
   2278 /// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
   2279 ///    x i32], truncating the result by rounding towards zero when it is
   2280 ///    inexact.
   2281 ///
   2282 /// \headerfile <x86intrin.h>
   2283 ///
   2284 /// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
   2285 ///
   2286 /// \param __a
   2287 ///    A 256-bit vector of [4 x double].
   2288 /// \returns A 128-bit integer vector containing the converted values.
   2289 static __inline __m128i __DEFAULT_FN_ATTRS
   2290 _mm256_cvttpd_epi32(__m256d __a)
   2291 {
   2292   return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
   2293 }
   2294 
   2295 /// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
   2296 ///    x i32]. When a conversion is inexact, the value returned is rounded
   2297 ///    according to the rounding control bits in the MXCSR register.
   2298 ///
   2299 /// \headerfile <x86intrin.h>
   2300 ///
   2301 /// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
   2302 ///
   2303 /// \param __a
   2304 ///    A 256-bit vector of [4 x double].
   2305 /// \returns A 128-bit integer vector containing the converted values.
   2306 static __inline __m128i __DEFAULT_FN_ATTRS
   2307 _mm256_cvtpd_epi32(__m256d __a)
   2308 {
   2309   return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
   2310 }
   2311 
   2312 /// \brief Converts a vector of [8 x float] into a vector of [8 x i32],
   2313 ///    truncating the result by rounding towards zero when it is inexact.
   2314 ///
   2315 /// \headerfile <x86intrin.h>
   2316 ///
   2317 /// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
   2318 ///
   2319 /// \param __a
   2320 ///    A 256-bit vector of [8 x float].
   2321 /// \returns A 256-bit integer vector containing the converted values.
   2322 static __inline __m256i __DEFAULT_FN_ATTRS
   2323 _mm256_cvttps_epi32(__m256 __a)
   2324 {
   2325   return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
   2326 }
   2327 
   2328 /// \brief Returns the first element of the input vector of [4 x double].
   2329 ///
   2330 /// \headerfile <avxintrin.h>
   2331 ///
   2332 /// This intrinsic is a utility function and does not correspond to a specific
   2333 ///    instruction.
   2334 ///
   2335 /// \param __a
   2336 ///    A 256-bit vector of [4 x double].
   2337 /// \returns A 64 bit double containing the first element of the input vector.
   2338 static __inline double __DEFAULT_FN_ATTRS
   2339 _mm256_cvtsd_f64(__m256d __a)
   2340 {
   2341  return __a[0];
   2342 }
   2343 
   2344 /// \brief Returns the first element of the input vector of [8 x i32].
   2345 ///
   2346 /// \headerfile <avxintrin.h>
   2347 ///
   2348 /// This intrinsic is a utility function and does not correspond to a specific
   2349 ///    instruction.
   2350 ///
   2351 /// \param __a
   2352 ///    A 256-bit vector of [8 x i32].
   2353 /// \returns A 32 bit integer containing the first element of the input vector.
   2354 static __inline int __DEFAULT_FN_ATTRS
   2355 _mm256_cvtsi256_si32(__m256i __a)
   2356 {
   2357  __v8si __b = (__v8si)__a;
   2358  return __b[0];
   2359 }
   2360 
   2361 /// \brief Returns the first element of the input vector of [8 x float].
   2362 ///
   2363 /// \headerfile <avxintrin.h>
   2364 ///
   2365 /// This intrinsic is a utility function and does not correspond to a specific
   2366 ///    instruction.
   2367 ///
   2368 /// \param __a
   2369 ///    A 256-bit vector of [8 x float].
   2370 /// \returns A 32 bit float containing the first element of the input vector.
   2371 static __inline float __DEFAULT_FN_ATTRS
   2372 _mm256_cvtss_f32(__m256 __a)
   2373 {
   2374  return __a[0];
   2375 }
   2376 
   2377 /* Vector replicate */
   2378 /// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit
   2379 ///    vector of [8 x float] to float values in a 256-bit vector of
   2380 ///    [8 x float].
   2381 ///
   2382 /// \headerfile <x86intrin.h>
   2383 ///
   2384 /// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
   2385 ///
   2386 /// \param __a
   2387 ///    A 256-bit vector of [8 x float]. \n
   2388 ///    Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
   2389 ///    the return value. \n
   2390 ///    Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
   2391 ///    the return value. \n
   2392 ///    Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
   2393 ///    return value. \n
   2394 ///    Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
   2395 ///    return value.
   2396 /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
   2397 ///    values.
   2398 static __inline __m256 __DEFAULT_FN_ATTRS
   2399 _mm256_movehdup_ps(__m256 __a)
   2400 {
   2401   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
   2402 }
   2403 
   2404 /// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit
   2405 ///    vector of [8 x float] to float values in a 256-bit vector of [8 x float].
   2406 ///
   2407 /// \headerfile <x86intrin.h>
   2408 ///
   2409 /// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
   2410 ///
   2411 /// \param __a
   2412 ///    A 256-bit vector of [8 x float]. \n
   2413 ///    Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
   2414 ///    the return value. \n
   2415 ///    Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
   2416 ///    the return value. \n
   2417 ///    Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
   2418 ///    return value. \n
   2419 ///    Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
   2420 ///    return value.
   2421 /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
   2422 ///    values.
   2423 static __inline __m256 __DEFAULT_FN_ATTRS
   2424 _mm256_moveldup_ps(__m256 __a)
   2425 {
   2426   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
   2427 }
   2428 
   2429 /// \brief Moves and duplicates double-precision floating point values from a
   2430 ///    256-bit vector of [4 x double] to double-precision values in a 256-bit
   2431 ///    vector of [4 x double].
   2432 ///
   2433 /// \headerfile <x86intrin.h>
   2434 ///
   2435 /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
   2436 ///
   2437 /// \param __a
   2438 ///    A 256-bit vector of [4 x double]. \n
   2439 ///    Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
   2440 ///    return value. \n
   2441 ///    Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
   2442 ///    the return value.
   2443 /// \returns A 256-bit vector of [4 x double] containing the moved and
   2444 ///    duplicated values.
   2445 static __inline __m256d __DEFAULT_FN_ATTRS
   2446 _mm256_movedup_pd(__m256d __a)
   2447 {
   2448   return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
   2449 }
   2450 
   2451 /* Unpack and Interleave */
   2452 /// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of
   2453 ///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
   2454 ///
   2455 /// \headerfile <x86intrin.h>
   2456 ///
   2457 /// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
   2458 ///
   2459 /// \param __a
   2460 ///    A 256-bit floating-point vector of [4 x double]. \n
   2461 ///    Bits [127:64] are written to bits [63:0] of the return value. \n
   2462 ///    Bits [255:192] are written to bits [191:128] of the return value. \n
   2463 /// \param __b
   2464 ///    A 256-bit floating-point vector of [4 x double]. \n
   2465 ///    Bits [127:64] are written to bits [127:64] of the return value. \n
   2466 ///    Bits [255:192] are written to bits [255:192] of the return value. \n
   2467 /// \returns A 256-bit vector of [4 x double] containing the interleaved values.
   2468 static __inline __m256d __DEFAULT_FN_ATTRS
   2469 _mm256_unpackhi_pd(__m256d __a, __m256d __b)
   2470 {
   2471   return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
   2472 }
   2473 
   2474 /// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of
   2475 ///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
   2476 ///
   2477 /// \headerfile <x86intrin.h>
   2478 ///
   2479 /// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
   2480 ///
   2481 /// \param __a
   2482 ///    A 256-bit floating-point vector of [4 x double]. \n
   2483 ///    Bits [63:0] are written to bits [63:0] of the return value. \n
   2484 ///    Bits [191:128] are written to bits [191:128] of the return value.
   2485 /// \param __b
   2486 ///    A 256-bit floating-point vector of [4 x double]. \n
   2487 ///    Bits [63:0] are written to bits [127:64] of the return value. \n
   2488 ///    Bits [191:128] are written to bits [255:192] of the return value. \n
   2489 /// \returns A 256-bit vector of [4 x double] containing the interleaved values.
   2490 static __inline __m256d __DEFAULT_FN_ATTRS
   2491 _mm256_unpacklo_pd(__m256d __a, __m256d __b)
   2492 {
   2493   return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
   2494 }
   2495 
   2496 /// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
   2497 ///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
   2498 ///    vector of [8 x float].
   2499 ///
   2500 /// \headerfile <x86intrin.h>
   2501 ///
   2502 /// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
   2503 ///
   2504 /// \param __a
   2505 ///    A 256-bit vector of [8 x float]. \n
   2506 ///    Bits [95:64] are written to bits [31:0] of the return value. \n
   2507 ///    Bits [127:96] are written to bits [95:64] of the return value. \n
   2508 ///    Bits [223:192] are written to bits [159:128] of the return value. \n
   2509 ///    Bits [255:224] are written to bits [223:192] of the return value.
   2510 /// \param __b
   2511 ///    A 256-bit vector of [8 x float]. \n
   2512 ///    Bits [95:64] are written to bits [63:32] of the return value. \n
   2513 ///    Bits [127:96] are written to bits [127:96] of the return value. \n
   2514 ///    Bits [223:192] are written to bits [191:160] of the return value. \n
   2515 ///    Bits [255:224] are written to bits [255:224] of the return value.
   2516 /// \returns A 256-bit vector of [8 x float] containing the interleaved values.
   2517 static __inline __m256 __DEFAULT_FN_ATTRS
   2518 _mm256_unpackhi_ps(__m256 __a, __m256 __b)
   2519 {
   2520   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
   2521 }
   2522 
   2523 /// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
   2524 ///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
   2525 ///    vector of [8 x float].
   2526 ///
   2527 /// \headerfile <x86intrin.h>
   2528 ///
   2529 /// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
   2530 ///
   2531 /// \param __a
   2532 ///    A 256-bit vector of [8 x float]. \n
   2533 ///    Bits [31:0] are written to bits [31:0] of the return value. \n
   2534 ///    Bits [63:32] are written to bits [95:64] of the return value. \n
   2535 ///    Bits [159:128] are written to bits [159:128] of the return value. \n
   2536 ///    Bits [191:160] are written to bits [223:192] of the return value.
   2537 /// \param __b
   2538 ///    A 256-bit vector of [8 x float]. \n
   2539 ///    Bits [31:0] are written to bits [63:32] of the return value. \n
   2540 ///    Bits [63:32] are written to bits [127:96] of the return value. \n
   2541 ///    Bits [159:128] are written to bits [191:160] of the return value. \n
   2542 ///    Bits [191:160] are written to bits [255:224] of the return value.
   2543 /// \returns A 256-bit vector of [8 x float] containing the interleaved values.
   2544 static __inline __m256 __DEFAULT_FN_ATTRS
   2545 _mm256_unpacklo_ps(__m256 __a, __m256 __b)
   2546 {
   2547   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
   2548 }
   2549 
   2550 /* Bit Test */
   2551 /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
   2552 ///    element-by-element comparison of the double-precision element in the
   2553 ///    first source vector and the corresponding element in the second source
   2554 ///    vector.
   2555 ///
   2556 ///    The EFLAGS register is updated as follows: \n
   2557 ///    If there is at least one pair of double-precision elements where the
   2558 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2559 ///    ZF flag is set to 1. \n
   2560 ///    If there is at least one pair of double-precision elements where the
   2561 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2562 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2563 ///    This intrinsic returns the value of the ZF flag.
   2564 ///
   2565 /// \headerfile <x86intrin.h>
   2566 ///
   2567 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
   2568 ///
   2569 /// \param __a
   2570 ///    A 128-bit vector of [2 x double].
   2571 /// \param __b
   2572 ///    A 128-bit vector of [2 x double].
   2573 /// \returns the ZF flag in the EFLAGS register.
   2574 static __inline int __DEFAULT_FN_ATTRS
   2575 _mm_testz_pd(__m128d __a, __m128d __b)
   2576 {
   2577   return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
   2578 }
   2579 
   2580 /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
   2581 ///    element-by-element comparison of the double-precision element in the
   2582 ///    first source vector and the corresponding element in the second source
   2583 ///    vector.
   2584 ///
   2585 ///    The EFLAGS register is updated as follows: \n
   2586 ///    If there is at least one pair of double-precision elements where the
   2587 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2588 ///    ZF flag is set to 1. \n
   2589 ///    If there is at least one pair of double-precision elements where the
   2590 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2591 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2592 ///    This intrinsic returns the value of the CF flag.
   2593 ///
   2594 /// \headerfile <x86intrin.h>
   2595 ///
   2596 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
   2597 ///
   2598 /// \param __a
   2599 ///    A 128-bit vector of [2 x double].
   2600 /// \param __b
   2601 ///    A 128-bit vector of [2 x double].
   2602 /// \returns the CF flag in the EFLAGS register.
   2603 static __inline int __DEFAULT_FN_ATTRS
   2604 _mm_testc_pd(__m128d __a, __m128d __b)
   2605 {
   2606   return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
   2607 }
   2608 
   2609 /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
   2610 ///    element-by-element comparison of the double-precision element in the
   2611 ///    first source vector and the corresponding element in the second source
   2612 ///    vector.
   2613 ///
   2614 ///    The EFLAGS register is updated as follows: \n
   2615 ///    If there is at least one pair of double-precision elements where the
   2616 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2617 ///    ZF flag is set to 1. \n
   2618 ///    If there is at least one pair of double-precision elements where the
   2619 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2620 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2621 ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
   2622 ///    otherwise it returns 0.
   2623 ///
   2624 /// \headerfile <x86intrin.h>
   2625 ///
   2626 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
   2627 ///
   2628 /// \param __a
   2629 ///    A 128-bit vector of [2 x double].
   2630 /// \param __b
   2631 ///    A 128-bit vector of [2 x double].
   2632 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
   2633 static __inline int __DEFAULT_FN_ATTRS
   2634 _mm_testnzc_pd(__m128d __a, __m128d __b)
   2635 {
   2636   return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
   2637 }
   2638 
   2639 /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
   2640 ///    element-by-element comparison of the single-precision elements in the
   2641 ///    first source vector and the corresponding elements in the second source
   2642 ///    vector.
   2643 ///
   2644 ///    The EFLAGS register is updated as follows: \n
   2645 ///    If there is at least one pair of single-precision elements where the
   2646 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2647 ///    ZF flag is set to 1. \n
   2648 ///    If there is at least one pair of single-precision elements where the
   2649 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2650 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2651 ///    This intrinsic returns the value of the ZF flag.
   2652 ///
   2653 /// \headerfile <x86intrin.h>
   2654 ///
   2655 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
   2656 ///
   2657 /// \param __a
   2658 ///    The first source vector: a 128-bit vector of [4 x float].
   2659 /// \param __b
   2660 ///    The second source vector: a 128-bit vector of [4 x float].
   2661 /// \returns the ZF flag: 1 if no pair has both sign bits set, otherwise 0.
   2662 static __inline int __DEFAULT_FN_ATTRS
   2663 _mm_testz_ps(__m128 __a, __m128 __b)
   2664 {
   2665   return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
   2666 }
   2667 
   2668 /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
   2669 ///    element-by-element comparison of the single-precision elements in the
   2670 ///    first source vector and the corresponding elements in the second source
   2671 ///    vector.
   2672 ///
   2673 ///    The EFLAGS register is updated as follows: \n
   2674 ///    If there is at least one pair of single-precision elements where the
   2675 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2676 ///    ZF flag is set to 1. \n
   2677 ///    If there is at least one pair of single-precision elements where the
   2678 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2679 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2680 ///    This intrinsic returns the value of the CF flag.
   2681 ///
   2682 /// \headerfile <x86intrin.h>
   2683 ///
   2684 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
   2685 ///
   2686 /// \param __a
   2687 ///    The first source vector: a 128-bit vector of [4 x float].
   2688 /// \param __b
   2689 ///    The second source vector: a 128-bit vector of [4 x float].
   2690 /// \returns the CF flag: 1 if no pair has sign bits 0 (first) and 1 (second).
   2691 static __inline int __DEFAULT_FN_ATTRS
   2692 _mm_testc_ps(__m128 __a, __m128 __b)
   2693 {
   2694   return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
   2695 }
   2696 
   2697 /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
   2698 ///    element-by-element comparison of the single-precision elements in the
   2699 ///    first source vector and the corresponding elements in the second source
   2700 ///    vector.
   2701 ///
   2702 ///    The EFLAGS register is updated as follows: \n
   2703 ///    If there is at least one pair of single-precision elements where the
   2704 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2705 ///    ZF flag is set to 1. \n
   2706 ///    If there is at least one pair of single-precision elements where the
   2707 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2708 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2709 ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
   2710 ///    otherwise it returns 0.
   2711 ///
   2712 /// \headerfile <x86intrin.h>
   2713 ///
   2714 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
   2715 ///
   2716 /// \param __a
   2717 ///    The first source vector: a 128-bit vector of [4 x float].
   2718 /// \param __b
   2719 ///    The second source vector: a 128-bit vector of [4 x float].
   2720 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
   2721 static __inline int __DEFAULT_FN_ATTRS
   2722 _mm_testnzc_ps(__m128 __a, __m128 __b)
   2723 {
   2724   return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
   2725 }
   2726 
   2727 /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
   2728 ///    element-by-element comparison of the double-precision elements in the
   2729 ///    first source vector and the corresponding elements in the second source
   2730 ///    vector.
   2731 ///
   2732 ///    The EFLAGS register is updated as follows: \n
   2733 ///    If there is at least one pair of double-precision elements where the
   2734 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2735 ///    ZF flag is set to 1. \n
   2736 ///    If there is at least one pair of double-precision elements where the
   2737 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2738 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2739 ///    This intrinsic returns the value of the ZF flag.
   2740 ///
   2741 /// \headerfile <x86intrin.h>
   2742 ///
   2743 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
   2744 ///
   2745 /// \param __a
   2746 ///    The first source vector: a 256-bit vector of [4 x double].
   2747 /// \param __b
   2748 ///    The second source vector: a 256-bit vector of [4 x double].
   2749 /// \returns the ZF flag: 1 if no pair has both sign bits set, otherwise 0.
   2750 static __inline int __DEFAULT_FN_ATTRS
   2751 _mm256_testz_pd(__m256d __a, __m256d __b)
   2752 {
   2753   return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
   2754 }
   2755 
   2756 /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
   2757 ///    element-by-element comparison of the double-precision elements in the
   2758 ///    first source vector and the corresponding elements in the second source
   2759 ///    vector.
   2760 ///
   2761 ///    The EFLAGS register is updated as follows: \n
   2762 ///    If there is at least one pair of double-precision elements where the
   2763 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2764 ///    ZF flag is set to 1. \n
   2765 ///    If there is at least one pair of double-precision elements where the
   2766 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2767 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2768 ///    This intrinsic returns the value of the CF flag.
   2769 ///
   2770 /// \headerfile <x86intrin.h>
   2771 ///
   2772 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
   2773 ///
   2774 /// \param __a
   2775 ///    The first source vector: a 256-bit vector of [4 x double].
   2776 /// \param __b
   2777 ///    The second source vector: a 256-bit vector of [4 x double].
   2778 /// \returns the CF flag: 1 if no pair has sign bits 0 (first) and 1 (second).
   2779 static __inline int __DEFAULT_FN_ATTRS
   2780 _mm256_testc_pd(__m256d __a, __m256d __b)
   2781 {
   2782   return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
   2783 }
   2784 
   2785 /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
   2786 ///    element-by-element comparison of the double-precision elements in the
   2787 ///    first source vector and the corresponding elements in the second source
   2788 ///    vector.
   2789 ///
   2790 ///    The EFLAGS register is updated as follows: \n
   2791 ///    If there is at least one pair of double-precision elements where the
   2792 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2793 ///    ZF flag is set to 1. \n
   2794 ///    If there is at least one pair of double-precision elements where the
   2795 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2796 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2797 ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
   2798 ///    otherwise it returns 0.
   2799 ///
   2800 /// \headerfile <x86intrin.h>
   2801 ///
   2802 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
   2803 ///
   2804 /// \param __a
   2805 ///    The first source vector: a 256-bit vector of [4 x double].
   2806 /// \param __b
   2807 ///    The second source vector: a 256-bit vector of [4 x double].
   2808 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
   2809 static __inline int __DEFAULT_FN_ATTRS
   2810 _mm256_testnzc_pd(__m256d __a, __m256d __b)
   2811 {
   2812   return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
   2813 }
   2814 
   2815 /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
   2816 ///    element-by-element comparison of the single-precision elements in the
   2817 ///    first source vector and the corresponding elements in the second source
   2818 ///    vector.
   2819 ///
   2820 ///    The EFLAGS register is updated as follows: \n
   2821 ///    If there is at least one pair of single-precision elements where the
   2822 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2823 ///    ZF flag is set to 1. \n
   2824 ///    If there is at least one pair of single-precision elements where the
   2825 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2826 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2827 ///    This intrinsic returns the value of the ZF flag.
   2828 ///
   2829 /// \headerfile <x86intrin.h>
   2830 ///
   2831 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
   2832 ///
   2833 /// \param __a
   2834 ///    The first source vector: a 256-bit vector of [8 x float].
   2835 /// \param __b
   2836 ///    The second source vector: a 256-bit vector of [8 x float].
   2837 /// \returns the ZF flag: 1 if no pair has both sign bits set, otherwise 0.
   2838 static __inline int __DEFAULT_FN_ATTRS
   2839 _mm256_testz_ps(__m256 __a, __m256 __b)
   2840 {
   2841   return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
   2842 }
   2843 
   2844 /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
   2845 ///    element-by-element comparison of the single-precision elements in the
   2846 ///    first source vector and the corresponding elements in the second source
   2847 ///    vector.
   2848 ///
   2849 ///    The EFLAGS register is updated as follows: \n
   2850 ///    If there is at least one pair of single-precision elements where the
   2851 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2852 ///    ZF flag is set to 1. \n
   2853 ///    If there is at least one pair of single-precision elements where the
   2854 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2855 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2856 ///    This intrinsic returns the value of the CF flag.
   2857 ///
   2858 /// \headerfile <x86intrin.h>
   2859 ///
   2860 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
   2861 ///
   2862 /// \param __a
   2863 ///    The first source vector: a 256-bit vector of [8 x float].
   2864 /// \param __b
   2865 ///    The second source vector: a 256-bit vector of [8 x float].
   2866 /// \returns the CF flag: 1 if no pair has sign bits 0 (first) and 1 (second).
   2867 static __inline int __DEFAULT_FN_ATTRS
   2868 _mm256_testc_ps(__m256 __a, __m256 __b)
   2869 {
   2870   return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
   2871 }
   2872 
   2873 /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
   2874 ///    element-by-element comparison of the single-precision elements in the
   2875 ///    first source vector and the corresponding elements in the second source
   2876 ///    vector.
   2877 ///
   2878 ///    The EFLAGS register is updated as follows: \n
   2879 ///    If there is at least one pair of single-precision elements where the
   2880 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2881 ///    ZF flag is set to 1. \n
   2882 ///    If there is at least one pair of single-precision elements where the
   2883 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2884 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2885 ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
   2886 ///    otherwise it returns 0.
   2887 ///
   2888 /// \headerfile <x86intrin.h>
   2889 ///
   2890 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
   2891 ///
   2892 /// \param __a
   2893 ///    The first source vector: a 256-bit vector of [8 x float].
   2894 /// \param __b
   2895 ///    The second source vector: a 256-bit vector of [8 x float].
   2896 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
   2897 static __inline int __DEFAULT_FN_ATTRS
   2898 _mm256_testnzc_ps(__m256 __a, __m256 __b)
   2899 {
   2900   return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
   2901 }
   2902 
   2903 /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
   2904 ///    of the two source vectors.
   2905 ///
   2906 ///    The EFLAGS register is updated as follows: \n
   2907 ///    If there is at least one pair of bits where both bits are 1, the ZF flag
   2908 ///    is set to 0. Otherwise the ZF flag is set to 1. \n
   2909 ///    If there is at least one pair of bits where the bit from the first source
   2910 ///    vector is 0 and the bit from the second source vector is 1, the CF flag
   2911 ///    is set to 0. Otherwise the CF flag is set to 1. \n
   2912 ///    This intrinsic returns the value of the ZF flag.
   2913 ///
   2914 /// \headerfile <x86intrin.h>
   2915 ///
   2916 /// This intrinsic corresponds to the <c> VPTEST </c> instruction.
   2917 ///
   2918 /// \param __a
   2919 ///    The first source vector: a 256-bit integer vector.
   2920 /// \param __b
   2921 ///    The second source vector: a 256-bit integer vector.
   2922 /// \returns the ZF flag: 1 if no pair of bits has both bits set, otherwise 0.
   2923 static __inline int __DEFAULT_FN_ATTRS
   2924 _mm256_testz_si256(__m256i __a, __m256i __b)
   2925 {
   2926   return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
   2927 }
   2928 
   2929 /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
   2930 ///    of the two source vectors.
   2931 ///
   2932 ///    The EFLAGS register is updated as follows: \n
   2933 ///    If there is at least one pair of bits where both bits are 1, the ZF flag
   2934 ///    is set to 0. Otherwise the ZF flag is set to 1. \n
   2935 ///    If there is at least one pair of bits where the bit from the first source
   2936 ///    vector is 0 and the bit from the second source vector is 1, the CF flag
   2937 ///    is set to 0. Otherwise the CF flag is set to 1. \n
   2938 ///    This intrinsic returns the value of the CF flag.
   2939 ///
   2940 /// \headerfile <x86intrin.h>
   2941 ///
   2942 /// This intrinsic corresponds to the <c> VPTEST </c> instruction.
   2943 ///
   2944 /// \param __a
   2945 ///    The first source vector: a 256-bit integer vector.
   2946 /// \param __b
   2947 ///    The second source vector: a 256-bit integer vector.
   2948 /// \returns the CF flag: 1 if no pair has bits 0 (first) and 1 (second).
   2949 static __inline int __DEFAULT_FN_ATTRS
   2950 _mm256_testc_si256(__m256i __a, __m256i __b)
   2951 {
   2952   return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
   2953 }
   2954 
   2955 /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
   2956 ///    of the two source vectors.
   2957 ///
   2958 ///    The EFLAGS register is updated as follows: \n
   2959 ///    If there is at least one pair of bits where both bits are 1, the ZF flag
   2960 ///    is set to 0. Otherwise the ZF flag is set to 1. \n
   2961 ///    If there is at least one pair of bits where the bit from the first source
   2962 ///    vector is 0 and the bit from the second source vector is 1, the CF flag
   2963 ///    is set to 0. Otherwise the CF flag is set to 1. \n
   2964 ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
   2965 ///    otherwise it returns 0.
   2966 ///
   2967 /// \headerfile <x86intrin.h>
   2968 ///
   2969 /// This intrinsic corresponds to the <c> VPTEST </c> instruction.
   2970 ///
   2971 /// \param __a
   2972 ///    The first source vector: a 256-bit integer vector.
   2973 /// \param __b
   2974 ///    The second source vector: a 256-bit integer vector.
   2975 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
   2976 static __inline int __DEFAULT_FN_ATTRS
   2977 _mm256_testnzc_si256(__m256i __a, __m256i __b)
   2978 {
   2979   return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
   2980 }
   2981 
   2982 /* Vector extract sign mask */
   2983 /// \brief Extracts the sign bits of double-precision floating point elements
   2984 ///    in a 256-bit vector of [4 x double] and writes them to the low-order
   2985 ///    bits of the return value.
   2986 ///
   2987 /// \headerfile <x86intrin.h>
   2988 ///
   2989 /// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
   2990 ///
   2991 /// \param __a
   2992 ///    A 256-bit vector of [4 x double] containing the double-precision
   2993 ///    floating point values with sign bits to be extracted.
   2994 /// \returns The sign bits from the operand, written to bits [3:0].
   2995 static __inline int __DEFAULT_FN_ATTRS
   2996 _mm256_movemask_pd(__m256d __a)
   2997 {
   2998   return __builtin_ia32_movmskpd256((__v4df)__a);
   2999 }
   3000 
   3001 /// \brief Extracts the sign bits of single-precision floating point elements
   3002 ///    in a 256-bit vector of [8 x float] and writes them to the low-order
   3003 ///    bits of the return value.
   3004 ///
   3005 /// \headerfile <x86intrin.h>
   3006 ///
   3007 /// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
   3008 ///
   3009 /// \param __a
   3010 ///    A 256-bit vector of [8 x float] containing the single-precision floating
   3011 ///    point values with sign bits to be extracted.
   3012 /// \returns The sign bits from the operand, written to bits [7:0].
   3013 static __inline int __DEFAULT_FN_ATTRS
   3014 _mm256_movemask_ps(__m256 __a)
   3015 {
   3016   return __builtin_ia32_movmskps256((__v8sf)__a);
   3017 }
   3018 
   3019 /* Vector zeroing */
   3020 /// \brief Zeroes the contents of all XMM or YMM registers.
   3021 ///
   3022 /// \headerfile <x86intrin.h>
   3023 ///
   3024 /// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
   3025 static __inline void __DEFAULT_FN_ATTRS
   3026 _mm256_zeroall(void)
   3027 {
   3028   __builtin_ia32_vzeroall();
   3029 }
   3030 
   3031 /// \brief Zeroes bits [255:128] (the upper 128 bits) of all YMM registers.
   3032 ///
   3033 /// \headerfile <x86intrin.h>
   3034 ///
   3035 /// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
   3036 static __inline void __DEFAULT_FN_ATTRS
   3037 _mm256_zeroupper(void)
   3038 {
   3039   __builtin_ia32_vzeroupper();
   3040 }
   3041 
   3042 /* Vector load with broadcast */
   3043 /// \brief Loads a scalar single-precision floating point value from the
   3044 ///    specified address pointed to by \a __a and broadcasts it to the elements
   3045 ///    of a [4 x float] vector.
   3046 ///
   3047 /// \headerfile <x86intrin.h>
   3048 ///
   3049 /// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
   3050 ///
   3051 /// \param __a
   3052 ///    A pointer to the single-precision floating point value to be broadcast.
   3053 /// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
   3054 ///    equal to the broadcast value.
   3055 static __inline __m128 __DEFAULT_FN_ATTRS
   3056 _mm_broadcast_ss(float const *__a)
   3057 {
   3058   float __f = *__a;
   3059   return (__m128)(__v4sf){ __f, __f, __f, __f };
   3060 }
   3061 
   3062 /// \brief Loads a scalar double-precision floating point value from the
   3063 ///    specified address pointed to by \a __a and broadcasts it to the elements
   3064 ///    of a [4 x double] vector.
   3065 ///
   3066 /// \headerfile <x86intrin.h>
   3067 ///
   3068 /// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
   3069 ///
   3070 /// \param __a
   3071 ///    A pointer to the double-precision floating point value to be broadcast.
   3072 /// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
   3073 ///    equal to the broadcast value.
   3074 static __inline __m256d __DEFAULT_FN_ATTRS
   3075 _mm256_broadcast_sd(double const *__a)
   3076 {
   3077   double __d = *__a;
   3078   return (__m256d)(__v4df){ __d, __d, __d, __d };
   3079 }
   3080 
   3081 /// \brief Loads a scalar single-precision floating point value from the
   3082 ///    specified address pointed to by \a __a and broadcasts it to the elements
   3083 ///    of a [8 x float] vector.
   3084 ///
   3085 /// \headerfile <x86intrin.h>
   3086 ///
   3087 /// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
   3088 ///
   3089 /// \param __a
   3090 ///    A pointer to the single-precision floating point value to be broadcast.
   3091 /// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
   3092 ///    equal to the broadcast value.
   3093 static __inline __m256 __DEFAULT_FN_ATTRS
   3094 _mm256_broadcast_ss(float const *__a)
   3095 {
   3096   float __f = *__a;
   3097   return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
   3098 }
   3099 
   3100 /// \brief Loads the data from a 128-bit vector of [2 x double] from the
   3101 ///    specified address pointed to by \a __a and broadcasts it to 128-bit
   3102 ///    elements in a 256-bit vector of [4 x double].
   3103 ///
   3104 /// \headerfile <x86intrin.h>
   3105 ///
   3106 /// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
   3107 ///
   3108 /// \param __a
   3109 ///    A pointer to the 128-bit vector of [2 x double] to be broadcast.
   3110 /// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
   3111 ///    equal to the broadcast value.
   3112 static __inline __m256d __DEFAULT_FN_ATTRS
   3113 _mm256_broadcast_pd(__m128d const *__a)
   3114 {
   3115   return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
   3116 }
   3117 
   3118 /// \brief Loads the data from a 128-bit vector of [4 x float] from the
   3119 ///    specified address pointed to by \a __a and broadcasts it to 128-bit
   3120 ///    elements in a 256-bit vector of [8 x float].
   3121 ///
   3122 /// \headerfile <x86intrin.h>
   3123 ///
   3124 /// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
   3125 ///
   3126 /// \param __a
   3127 ///    A pointer to the 128-bit vector of [4 x float] to be broadcast.
   3128 /// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
   3129 ///    equal to the broadcast value.
   3130 static __inline __m256 __DEFAULT_FN_ATTRS
   3131 _mm256_broadcast_ps(__m128 const *__a)
   3132 {
   3133   return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
   3134 }
   3135 
   3136 /* SIMD load ops */
   3137 /// \brief Loads 4 double-precision floating point values from a 32-byte aligned
   3138 ///    memory location pointed to by \a __p into a vector of [4 x double].
   3139 ///
   3140 /// \headerfile <x86intrin.h>
   3141 ///
   3142 /// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
   3143 ///
   3144 /// \param __p
   3145 ///    A 32-byte aligned pointer to a memory location containing
   3146 ///    double-precision floating point values. The memory is only read.
   3147 /// \returns A 256-bit vector of [4 x double] containing the loaded values.
   3148 static __inline __m256d __DEFAULT_FN_ATTRS
   3149 _mm256_load_pd(double const *__p)
   3150 {
   3151   return *(const __m256d *)__p;
   3152 }
   3153 
   3154 /// \brief Loads 8 single-precision floating point values from a 32-byte aligned
   3155 ///    memory location pointed to by \a __p into a vector of [8 x float].
   3156 ///
   3157 /// \headerfile <x86intrin.h>
   3158 ///
   3159 /// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
   3160 ///
   3161 /// \param __p
   3162 ///    A 32-byte aligned pointer to a memory location containing float values.
   3163 /// \returns A 256-bit vector of [8 x float] containing the loaded values.
   3164 static __inline __m256 __DEFAULT_FN_ATTRS
   3165 _mm256_load_ps(float const *__p)
   3166 {
   3167   return *(const __m256 *)__p;
   3168 }
   3169 
   3170 /// \brief Loads 4 double-precision floating point values from an unaligned
   3171 ///    memory location pointed to by \a __p into a vector of [4 x double].
   3172 ///
   3173 /// \headerfile <x86intrin.h>
   3174 ///
   3175 /// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
   3176 ///
   3177 /// \param __p
   3178 ///    A pointer to a memory location containing double-precision floating
   3179 ///    point values.
   3180 /// \returns A 256-bit vector of [4 x double] containing the loaded values.
   3181 static __inline __m256d __DEFAULT_FN_ATTRS
   3182 _mm256_loadu_pd(double const *__p)
   3183 {
   3184   struct __loadu_pd {
   3185     __m256d __v;
   3186   } __attribute__((__packed__, __may_alias__));
   3187   return ((const struct __loadu_pd*)__p)->__v;
   3188 }
   3189 
   3190 /// \brief Loads 8 single-precision floating point values from an unaligned
   3191 ///    memory location pointed to by \a __p into a vector of [8 x float].
   3192 ///
   3193 /// \headerfile <x86intrin.h>
   3194 ///
   3195 /// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
   3196 ///
   3197 /// \param __p
   3198 ///    A pointer to a memory location containing single-precision floating
   3199 ///    point values.
   3200 /// \returns A 256-bit vector of [8 x float] containing the loaded values.
   3201 static __inline __m256 __DEFAULT_FN_ATTRS
   3202 _mm256_loadu_ps(float const *__p)
   3203 {
   3204   struct __loadu_ps {
   3205     __m256 __v;
   3206   } __attribute__((__packed__, __may_alias__));
   3207   return ((const struct __loadu_ps*)__p)->__v;
   3208 }
   3209 
   3210 /// \brief Loads 256 bits of integer data from a 32-byte aligned memory
   3211 ///    location pointed to by \a __p into elements of a 256-bit integer vector.
   3212 ///
   3213 /// \headerfile <x86intrin.h>
   3214 ///
   3215 /// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
   3216 ///
   3217 /// \param __p
   3218 ///    A 32-byte aligned pointer to a 256-bit integer vector containing integer
   3219 ///    values.
   3220 /// \returns A 256-bit integer vector containing the loaded values.
   3221 static __inline __m256i __DEFAULT_FN_ATTRS
   3222 _mm256_load_si256(__m256i const *__p)
   3223 {
   3224   return *__p;
   3225 }
   3226 
   3227 /// \brief Loads 256 bits of integer data from an unaligned memory location
   3228 ///    pointed to by \a __p into a 256-bit integer vector.
   3229 ///
   3230 /// \headerfile <x86intrin.h>
   3231 ///
   3232 /// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
   3233 ///
   3234 /// \param __p
   3235 ///    A pointer to a 256-bit integer vector containing integer values.
   3236 /// \returns A 256-bit integer vector containing the loaded values.
   3237 static __inline __m256i __DEFAULT_FN_ATTRS
   3238 _mm256_loadu_si256(__m256i const *__p)
   3239 {
   3240   struct __loadu_si256 {
   3241     __m256i __v;
   3242   } __attribute__((__packed__, __may_alias__));
   3243   return ((const struct __loadu_si256*)__p)->__v;
   3244 }
   3245 
   3246 /// \brief Loads 256 bits of integer data from an unaligned memory location
   3247 ///    pointed to by \a __p into a 256-bit integer vector. This intrinsic may
   3248 ///    perform better than \c _mm256_loadu_si256 when the data crosses a cache
   3249 ///    line boundary.
   3250 ///
   3251 /// \headerfile <x86intrin.h>
   3252 ///
   3253 /// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
   3254 ///
   3255 /// \param __p
   3256 ///    A pointer to a 256-bit integer vector containing integer values.
   3257 /// \returns A 256-bit integer vector containing the loaded values.
   3258 static __inline __m256i __DEFAULT_FN_ATTRS
   3259 _mm256_lddqu_si256(__m256i const *__p)
   3260 {
   3261   return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
   3262 }
   3263 
   3264 /* SIMD store ops */
   3265 /// \brief Stores double-precision floating point values from a 256-bit vector
   3266 ///    of [4 x double] to a 32-byte aligned memory location pointed to by
   3267 ///    \a __p.
   3268 ///
   3269 /// \headerfile <x86intrin.h>
   3270 ///
   3271 /// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
   3272 ///
   3273 /// \param __p
   3274 ///    A 32-byte aligned pointer to a memory location that will receive the
   3275 ///    double-precision floating point values.
   3276 /// \param __a
   3277 ///    A 256-bit vector of [4 x double] containing the values to be stored.
   3278 static __inline void __DEFAULT_FN_ATTRS
   3279 _mm256_store_pd(double *__p, __m256d __a)
   3280 {
   3281   *(__m256d *)__p = __a;
   3282 }
   3283 
   3284 /// \brief Stores single-precision floating point values from a 256-bit vector
   3285 ///    of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
   3286 ///
   3287 /// \headerfile <x86intrin.h>
   3288 ///
   3289 /// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
   3290 ///
   3291 /// \param __p
   3292 ///    A 32-byte aligned pointer to a memory location that will receive the
   3293 ///    float values.
   3294 /// \param __a
   3295 ///    A 256-bit vector of [8 x float] containing the values to be stored.
   3296 static __inline void __DEFAULT_FN_ATTRS
   3297 _mm256_store_ps(float *__p, __m256 __a)
   3298 {
   3299   *(__m256 *)__p = __a;
   3300 }
   3301 
   3302 /// \brief Stores double-precision floating point values from a 256-bit vector
   3303 ///    of [4 x double] to an unaligned memory location pointed to by \a __p.
   3304 ///
   3305 /// \headerfile <x86intrin.h>
   3306 ///
   3307 /// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
   3308 ///
   3309 /// \param __p
   3310 ///    A pointer to a memory location that will receive the double-precision
   3311 ///    floating point values.
   3312 /// \param __a
   3313 ///    A 256-bit vector of [4 x double] containing the values to be stored.
   3314 static __inline void __DEFAULT_FN_ATTRS
   3315 _mm256_storeu_pd(double *__p, __m256d __a)
   3316 {
   3317   struct __storeu_pd {
   3318     __m256d __v;
   3319   } __attribute__((__packed__, __may_alias__));
   3320   ((struct __storeu_pd*)__p)->__v = __a;
   3321 }
   3322 
   3323 /// \brief Stores single-precision floating point values from a 256-bit vector
   3324 ///    of [8 x float] to an unaligned memory location pointed to by \a __p.
   3325 ///
   3326 /// \headerfile <x86intrin.h>
   3327 ///
   3328 /// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
   3329 ///
   3330 /// \param __p
   3331 ///    A pointer to a memory location that will receive the float values.
   3332 /// \param __a
   3333 ///    A 256-bit vector of [8 x float] containing the values to be stored.
   3334 static __inline void __DEFAULT_FN_ATTRS
   3335 _mm256_storeu_ps(float *__p, __m256 __a)
   3336 {
   3337   struct __storeu_ps {
   3338     __m256 __v;
   3339   } __attribute__((__packed__, __may_alias__));
   3340   ((struct __storeu_ps*)__p)->__v = __a;
   3341 }
   3342 
   3343 /// \brief Stores integer values from a 256-bit integer vector to a 32-byte
   3344 ///    aligned memory location pointed to by \a __p.
   3345 ///
   3346 /// \headerfile <x86intrin.h>
   3347 ///
   3348 /// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
   3349 ///
   3350 /// \param __p
   3351 ///    A 32-byte aligned pointer to a memory location that will receive the
   3352 ///    integer values.
   3353 /// \param __a
   3354 ///    A 256-bit integer vector containing the values to be stored.
   3355 static __inline void __DEFAULT_FN_ATTRS
   3356 _mm256_store_si256(__m256i *__p, __m256i __a)
   3357 {
   3358   *__p = __a;
   3359 }
   3360 
   3361 /// \brief Stores integer values from a 256-bit integer vector to an unaligned
   3362 ///    memory location pointed to by \a __p.
   3363 ///
   3364 /// \headerfile <x86intrin.h>
   3365 ///
   3366 /// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
   3367 ///
   3368 /// \param __p
   3369 ///    A pointer to a memory location that will receive the integer values.
   3370 /// \param __a
   3371 ///    A 256-bit integer vector containing the values to be moved.
   3372 static __inline void __DEFAULT_FN_ATTRS
   3373 _mm256_storeu_si256(__m256i *__p, __m256i __a)
   3374 {
   3375   struct __storeu_si256 { /* function-local wrapper; exists only to type the store below */
   3376     __m256i __v;
   3377   } __attribute__((__packed__, __may_alias__)); /* packed: no alignment assumed for __v; may_alias: exempt from type-based aliasing */
   3378   ((struct __storeu_si256*)__p)->__v = __a; /* view __p as the wrapper and store the whole vector through it */
   3379 }
   3380 
   3381 /* Conditional load ops */
   3382 /// \brief Conditionally loads double-precision floating point elements from a
   3383 ///    memory location pointed to by \a __p into a 128-bit vector of
   3384 ///    [2 x double], depending on the mask bits associated with each data
   3385 ///    element.
   3386 ///
   3387 /// \headerfile <x86intrin.h>
   3388 ///
   3389 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
   3390 ///
   3391 /// \param __p
   3392 ///    A pointer to a memory location that contains the double-precision
   3393 ///    floating point values.
   3394 /// \param __m
   3395 ///    A 128-bit integer vector containing the mask. The most significant bit of
   3396 ///    each data element represents the mask bits. If a mask bit is zero, the
   3397 ///    corresponding value in the memory location is not loaded and the
   3398 ///    corresponding field in the return value is set to zero.
   3399 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
   3400 static __inline __m128d __DEFAULT_FN_ATTRS
   3401 _mm_maskload_pd(double const *__p, __m128i __m)
   3402 { /* Forwards to the builtin; the casts only retype __p and __m as the vector operands passed to it. */
   3403   return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
   3404 }
   3405 
   3406 /// \brief Conditionally loads double-precision floating point elements from a
   3407 ///    memory location pointed to by \a __p into a 256-bit vector of
   3408 ///    [4 x double], depending on the mask bits associated with each data
   3409 ///    element.
   3410 ///
   3411 /// \headerfile <x86intrin.h>
   3412 ///
   3413 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
   3414 ///
   3415 /// \param __p
   3416 ///    A pointer to a memory location that contains the double-precision
   3417 ///    floating point values.
   3418 /// \param __m
   3419 ///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
   3420 ///    significant bit of each quadword element represents the mask bits. If a
   3421 ///    mask bit is zero, the corresponding value in the memory location is not
   3422 ///    loaded and the corresponding field in the return value is set to zero.
   3423 /// \returns A 256-bit vector of [4 x double] containing the loaded values.
   3424 static __inline __m256d __DEFAULT_FN_ATTRS
   3425 _mm256_maskload_pd(double const *__p, __m256i __m)
   3426 { /* Forwards to the 256-bit builtin; the casts only retype the operands. */
   3427   return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p, /* source viewed as a vector of double */
   3428                                                (__v4di)__m); /* mask viewed as long long lanes */
   3429 }
   3430 
   3431 /// \brief Conditionally loads single-precision floating point elements from a
   3432 ///    memory location pointed to by \a __p into a 128-bit vector of
   3433 ///    [4 x float], depending on the mask bits associated with each data
   3434 ///    element.
   3435 ///
   3436 /// \headerfile <x86intrin.h>
   3437 ///
   3438 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
   3439 ///
   3440 /// \param __p
   3441 ///    A pointer to a memory location that contains the single-precision
   3442 ///    floating point values.
   3443 /// \param __m
   3444 ///    A 128-bit integer vector containing the mask. The most significant bit of
   3445 ///    each data element represents the mask bits. If a mask bit is zero, the
   3446 ///    corresponding value in the memory location is not loaded and the
   3447 ///    corresponding field in the return value is set to zero.
   3448 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
   3449 static __inline __m128 __DEFAULT_FN_ATTRS
   3450 _mm_maskload_ps(float const *__p, __m128i __m)
   3451 { /* Forwards to the builtin; the casts only retype __p and __m as the vector operands passed to it. */
   3452   return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
   3453 }
   3454 
   3455 /// \brief Conditionally loads single-precision floating point elements from a
   3456 ///    memory location pointed to by \a __p into a 256-bit vector of
   3457 ///    [8 x float], depending on the mask bits associated with each data
   3458 ///    element.
   3459 ///
   3460 /// \headerfile <x86intrin.h>
   3461 ///
   3462 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
   3463 ///
   3464 /// \param __p
   3465 ///    A pointer to a memory location that contains the single-precision
   3466 ///    floating point values.
   3467 /// \param __m
   3468 ///    A 256-bit integer vector of [8 x dword] containing the mask. The most
   3469 ///    significant bit of each dword element represents the mask bits. If a mask
   3470 ///    bit is zero, the corresponding value in the memory location is not loaded
   3471 ///    and the corresponding field in the return value is set to zero.
   3472 /// \returns A 256-bit vector of [8 x float] containing the loaded values.
   3473 static __inline __m256 __DEFAULT_FN_ATTRS
   3474 _mm256_maskload_ps(float const *__p, __m256i __m)
   3475 { /* Forwards to the 256-bit builtin: source viewed as a vector of float, mask viewed as int lanes. */
   3476   return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
   3477 }
   3478 
   3479 /* Conditional store ops */
   3480 /// \brief Moves single-precision floating point values from a 256-bit vector
   3481 ///    of [8 x float] to a memory location pointed to by \a __p, according to
   3482 ///    the specified mask.
   3483 ///
   3484 /// \headerfile <x86intrin.h>
   3485 ///
   3486 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
   3487 ///
   3488 /// \param __p
   3489 ///    A pointer to a memory location that will receive the float values.
   3490 /// \param __m
   3491 ///    A 256-bit integer vector of [8 x dword] containing the mask. The most
   3492 ///    significant bit of each dword element in the mask vector represents the
   3493 ///    mask bits. If a mask bit is zero, the corresponding value from vector
   3494 ///    \a __a is not stored and the corresponding field in the memory location
   3495 ///    pointed to by \a __p is not changed.
   3496 /// \param __a
   3497 ///    A 256-bit vector of [8 x float] containing the values to be stored.
   3498 static __inline void __DEFAULT_FN_ATTRS
   3499 _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
   3500 { /* Forwards destination, mask and data to the builtin in the same order; mask is viewed as int lanes. */
   3501   __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
   3502 }
   3503 
   3504 /// \brief Moves double-precision values from a 128-bit vector of [2 x double]
   3505 ///    to a memory location pointed to by \a __p, according to the specified
   3506 ///    mask.
   3507 ///
   3508 /// \headerfile <x86intrin.h>
   3509 ///
   3510 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
   3511 ///
   3512 /// \param __p
   3513 ///    A pointer to a memory location that will receive the double values.
   3514 /// \param __m
   3515 ///    A 128-bit integer vector containing the mask. The most significant bit of
   3516 ///    each field in the mask vector represents the mask bits. If a mask bit is
   3517 ///    zero, the corresponding value from vector \a __a is not stored and the
   3518 ///    corresponding field in the memory location pointed to by \a __p is not
   3519 ///    changed.
   3520 /// \param __a
   3521 ///    A 128-bit vector of [2 x double] containing the values to be stored.
   3522 static __inline void __DEFAULT_FN_ATTRS
   3523 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
   3524 { /* Forwards destination, mask and data to the builtin in the same order; the casts only retype the operands. */
   3525   __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
   3526 }
   3527 
   3528 /// \brief Moves double-precision values from a 256-bit vector of [4 x double]
   3529 ///    to a memory location pointed to by \a __p, according to the specified
   3530 ///    mask.
   3531 ///
   3532 /// \headerfile <x86intrin.h>
   3533 ///
   3534 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
   3535 ///
   3536 /// \param __p
   3537 ///    A pointer to a memory location that will receive the double values.
   3538 /// \param __m
   3539 ///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
   3540 ///    significant bit of each quadword element in the mask vector represents
   3541 ///    the mask bits. If a mask bit is zero, the corresponding value from vector
   3542 ///    \a __a is not stored and the corresponding field in the memory location
   3543 ///    pointed to by \a __p is not changed.
   3544 /// \param __a
   3545 ///    A 256-bit vector of [4 x double] containing the values to be stored.
   3546 static __inline void __DEFAULT_FN_ATTRS
   3547 _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
   3548 { /* Forwards destination, mask and data to the builtin in the same order; mask is viewed as long long lanes. */
   3549   __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
   3550 }
   3551 
   3552 /// \brief Moves single-precision floating point values from a 128-bit vector
   3553 ///    of [4 x float] to a memory location pointed to by \a __p, according to
   3554 ///    the specified mask.
   3555 ///
   3556 /// \headerfile <x86intrin.h>
   3557 ///
   3558 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
   3559 ///
   3560 /// \param __p
   3561 ///    A pointer to a memory location that will receive the float values.
   3562 /// \param __m
   3563 ///    A 128-bit integer vector containing the mask. The most significant bit of
   3564 ///    each field in the mask vector represents the mask bits. If a mask bit is
   3565 ///    zero, the corresponding value from vector \a __a is not stored and the
   3566 ///    corresponding field in the memory location pointed to by \a __p is not
   3567 ///    changed.
   3568 /// \param __a
   3569 ///    A 128-bit vector of [4 x float] containing the values to be stored.
   3570 static __inline void __DEFAULT_FN_ATTRS
   3571 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
   3572 { /* Forwards destination, mask and data to the builtin in the same order; the casts only retype the operands. */
   3573   __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
   3574 }
   3575 
   3576 /* Cacheability support ops */
   3577 /// \brief Moves integer data from a 256-bit integer vector to a 32-byte
   3578 ///    aligned memory location. To minimize caching, the data is flagged as
   3579 ///    non-temporal (unlikely to be used again soon).
   3580 ///
   3581 /// \headerfile <x86intrin.h>
   3582 ///
   3583 /// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
   3584 ///
   3585 /// \param __a
   3586 ///    A pointer to a 32-byte aligned memory location that will receive the
   3587 ///    integer values.
   3588 /// \param __b
   3589 ///    A 256-bit integer vector containing the values to be moved.
   3590 static __inline void __DEFAULT_FN_ATTRS
   3591 _mm256_stream_si256(__m256i *__a, __m256i __b)
   3592 { /* Note the argument swap: the builtin is given the value (__b) first and the destination (__a) second. */
   3593   __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a); /* both operands are retyped to the long long lane vector */
   3594 }
   3595 
   3596 /// \brief Moves double-precision values from a 256-bit vector of [4 x double]
   3597 ///    to a 32-byte aligned memory location. To minimize caching, the data is
   3598 ///    flagged as non-temporal (unlikely to be used again soon).
   3599 ///
   3600 /// \headerfile <x86intrin.h>
   3601 ///
   3602 /// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
   3603 ///
   3604 /// \param __a
   3605 ///    A pointer to a 32-byte aligned memory location that will receive the
   3606 ///    double-precision floating-point values.
   3607 /// \param __b
   3608 ///    A 256-bit vector of [4 x double] containing the values to be moved.
   3609 static __inline void __DEFAULT_FN_ATTRS
   3610 _mm256_stream_pd(double *__a, __m256d __b)
   3611 { /* Note the argument swap: the builtin is given the value (__b) first and the destination (__a) second. */
   3612   __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a); /* the double pointer is retyped as a pointer to a whole vector */
   3613 }
   3614 
   3615 /// \brief Moves single-precision floating point values from a 256-bit vector
   3616 ///    of [8 x float] to a 32-byte aligned memory location. To minimize
   3617 ///    caching, the data is flagged as non-temporal (unlikely to be used again
   3618 ///    soon).
   3619 ///
   3620 /// \headerfile <x86intrin.h>
   3621 ///
   3622 /// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
   3623 ///
   3624 /// \param __p
   3625 ///    A pointer to a 32-byte aligned memory location that will receive the
   3626 ///    single-precision floating point values.
   3627 /// \param __a
   3628 ///    A 256-bit vector of [8 x float] containing the values to be moved.
   3629 static __inline void __DEFAULT_FN_ATTRS
   3630 _mm256_stream_ps(float *__p, __m256 __a)
   3631 { /* The builtin is given the value (__a) first and the destination (__p) second. */
   3632   __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p); /* the float pointer is retyped as a pointer to a whole vector */
   3633 }
   3634 
   3635 /* Create vectors */
   3636 /// \brief Create a 256-bit vector of [4 x double] with undefined values.
   3637 ///
   3638 /// \headerfile <x86intrin.h>
   3639 ///
   3640 /// This intrinsic has no corresponding instruction.
   3641 ///
   3642 /// \returns A 256-bit vector of [4 x double] containing undefined values.
   3643 static __inline__ __m256d __DEFAULT_FN_ATTRS
   3644 _mm256_undefined_pd(void)
   3645 { /* Takes no inputs; the builtin's result is cast to __m256d and returned as-is. */
   3646   return (__m256d)__builtin_ia32_undef256();
   3647 }
   3648 
   3649 /// \brief Create a 256-bit vector of [8 x float] with undefined values.
   3650 ///
   3651 /// \headerfile <x86intrin.h>
   3652 ///
   3653 /// This intrinsic has no corresponding instruction.
   3654 ///
   3655 /// \returns A 256-bit vector of [8 x float] containing undefined values.
   3656 static __inline__ __m256 __DEFAULT_FN_ATTRS
   3657 _mm256_undefined_ps(void)
   3658 { /* Takes no inputs; the builtin's result is cast to __m256 and returned as-is. */
   3659   return (__m256)__builtin_ia32_undef256();
   3660 }
   3661 
   3662 /// \brief Create a 256-bit integer vector with undefined values.
   3663 ///
   3664 /// \headerfile <x86intrin.h>
   3665 ///
   3666 /// This intrinsic has no corresponding instruction.
   3667 ///
   3668 /// \returns A 256-bit integer vector containing undefined values.
   3669 static __inline__ __m256i __DEFAULT_FN_ATTRS
   3670 _mm256_undefined_si256(void)
   3671 { /* Takes no inputs; the builtin's result is cast to __m256i and returned as-is. */
   3672   return (__m256i)__builtin_ia32_undef256();
   3673 }
   3674 
   3675 /// \brief Constructs a 256-bit floating-point vector of [4 x double]
   3676 ///    initialized with the specified double-precision floating-point values.
   3677 ///
   3678 /// \headerfile <x86intrin.h>
   3679 ///
   3680 /// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
   3681 ///   instruction.
   3682 ///
   3683 /// \param __a
   3684 ///    A double-precision floating-point value used to initialize bits [255:192]
   3685 ///    of the result.
   3686 /// \param __b
   3687 ///    A double-precision floating-point value used to initialize bits [191:128]
   3688 ///    of the result.
   3689 /// \param __c
   3690 ///    A double-precision floating-point value used to initialize bits [127:64]
   3691 ///    of the result.
   3692 /// \param __d
   3693 ///    A double-precision floating-point value used to initialize bits [63:0]
   3694 ///    of the result.
   3695 /// \returns An initialized 256-bit floating-point vector of [4 x double].
   3696 static __inline __m256d __DEFAULT_FN_ATTRS
   3697 _mm256_set_pd(double __a, double __b, double __c, double __d)
   3698 { /* The initializer lists elements from the lowest bits upward, so the arguments appear reversed. */
   3699   return (__m256d){ __d, __c, __b, __a }; /* __d -> bits [63:0] ... __a -> bits [255:192] */
   3700 }
   3701 
   3702 /// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized
   3703 ///    with the specified single-precision floating-point values.
   3704 ///
   3705 /// \headerfile <x86intrin.h>
   3706 ///
   3707 /// This intrinsic is a utility function and does not correspond to a specific
   3708 ///   instruction.
   3709 ///
   3710 /// \param __a
   3711 ///    A single-precision floating-point value used to initialize bits [255:224]
   3712 ///    of the result.
   3713 /// \param __b
   3714 ///    A single-precision floating-point value used to initialize bits [223:192]
   3715 ///    of the result.
   3716 /// \param __c
   3717 ///    A single-precision floating-point value used to initialize bits [191:160]
   3718 ///    of the result.
   3719 /// \param __d
   3720 ///    A single-precision floating-point value used to initialize bits [159:128]
   3721 ///    of the result.
   3722 /// \param __e
   3723 ///    A single-precision floating-point value used to initialize bits [127:96]
   3724 ///    of the result.
   3725 /// \param __f
   3726 ///    A single-precision floating-point value used to initialize bits [95:64]
   3727 ///    of the result.
   3728 /// \param __g
   3729 ///    A single-precision floating-point value used to initialize bits [63:32]
   3730 ///    of the result.
   3731 /// \param __h
   3732 ///    A single-precision floating-point value used to initialize bits [31:0]
   3733 ///    of the result.
   3734 /// \returns An initialized 256-bit floating-point vector of [8 x float].
   3735 static __inline __m256 __DEFAULT_FN_ATTRS
   3736 _mm256_set_ps(float __a, float __b, float __c, float __d,
   3737               float __e, float __f, float __g, float __h)
   3738 { /* The initializer lists elements from the lowest bits upward, so the arguments appear reversed. */
   3739   return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a }; /* __h -> bits [31:0] ... __a -> bits [255:224] */
   3740 }
   3741 
   3742 /// \brief Constructs a 256-bit integer vector initialized with the specified
   3743 ///    32-bit integral values.
   3744 ///
   3745 /// \headerfile <x86intrin.h>
   3746 ///
   3747 /// This intrinsic is a utility function and does not correspond to a specific
   3748 ///   instruction.
   3749 ///
   3750 /// \param __i0
   3751 ///    A 32-bit integral value used to initialize bits [255:224] of the result.
   3752 /// \param __i1
   3753 ///    A 32-bit integral value used to initialize bits [223:192] of the result.
   3754 /// \param __i2
   3755 ///    A 32-bit integral value used to initialize bits [191:160] of the result.
   3756 /// \param __i3
   3757 ///    A 32-bit integral value used to initialize bits [159:128] of the result.
   3758 /// \param __i4
   3759 ///    A 32-bit integral value used to initialize bits [127:96] of the result.
   3760 /// \param __i5
   3761 ///    A 32-bit integral value used to initialize bits [95:64] of the result.
   3762 /// \param __i6
   3763 ///    A 32-bit integral value used to initialize bits [63:32] of the result.
   3764 /// \param __i7
   3765 ///    A 32-bit integral value used to initialize bits [31:0] of the result.
   3766 /// \returns An initialized 256-bit integer vector.
   3767 static __inline __m256i __DEFAULT_FN_ATTRS
   3768 _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
   3769                  int __i4, int __i5, int __i6, int __i7)
   3770 { /* Built as an int-lane vector with the arguments reversed (__i7 lowest), then cast to the generic __m256i. */
   3771   return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
   3772 }
   3773 
   3774 /// \brief Constructs a 256-bit integer vector initialized with the specified
   3775 ///    16-bit integral values.
   3776 ///
   3777 /// \headerfile <x86intrin.h>
   3778 ///
   3779 /// This intrinsic is a utility function and does not correspond to a specific
   3780 ///   instruction.
   3781 ///
   3782 /// \param __w15
   3783 ///    A 16-bit integral value used to initialize bits [255:240] of the result.
   3784 /// \param __w14
   3785 ///    A 16-bit integral value used to initialize bits [239:224] of the result.
   3786 /// \param __w13
   3787 ///    A 16-bit integral value used to initialize bits [223:208] of the result.
   3788 /// \param __w12
   3789 ///    A 16-bit integral value used to initialize bits [207:192] of the result.
   3790 /// \param __w11
   3791 ///    A 16-bit integral value used to initialize bits [191:176] of the result.
   3792 /// \param __w10
   3793 ///    A 16-bit integral value used to initialize bits [175:160] of the result.
   3794 /// \param __w09
   3795 ///    A 16-bit integral value used to initialize bits [159:144] of the result.
   3796 /// \param __w08
   3797 ///    A 16-bit integral value used to initialize bits [143:128] of the result.
   3798 /// \param __w07
   3799 ///    A 16-bit integral value used to initialize bits [127:112] of the result.
   3800 /// \param __w06
   3801 ///    A 16-bit integral value used to initialize bits [111:96] of the result.
   3802 /// \param __w05
   3803 ///    A 16-bit integral value used to initialize bits [95:80] of the result.
   3804 /// \param __w04
   3805 ///    A 16-bit integral value used to initialize bits [79:64] of the result.
   3806 /// \param __w03
   3807 ///    A 16-bit integral value used to initialize bits [63:48] of the result.
   3808 /// \param __w02
   3809 ///    A 16-bit integral value used to initialize bits [47:32] of the result.
   3810 /// \param __w01
   3811 ///    A 16-bit integral value used to initialize bits [31:16] of the result.
   3812 /// \param __w00
   3813 ///    A 16-bit integral value used to initialize bits [15:0] of the result.
   3814 /// \returns An initialized 256-bit integer vector.
   3815 static __inline __m256i __DEFAULT_FN_ATTRS
   3816 _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
   3817                  short __w11, short __w10, short __w09, short __w08,
   3818                  short __w07, short __w06, short __w05, short __w04,
   3819                  short __w03, short __w02, short __w01, short __w00)
   3820 { /* Parameters are declared __w15 first; the initializer lists __w00 first so it occupies the lowest bits. */
   3821   return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06, /* short-lane vector, cast to __m256i */
   3822     __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
   3823 }
   3824 
   3825 /// \brief Constructs a 256-bit integer vector initialized with the specified
   3826 ///    8-bit integral values.
   3827 ///
   3828 /// \headerfile <x86intrin.h>
   3829 ///
   3830 /// This intrinsic is a utility function and does not correspond to a specific
   3831 ///   instruction.
   3832 ///
   3833 /// \param __b31
   3834 ///    An 8-bit integral value used to initialize bits [255:248] of the result.
   3835 /// \param __b30
   3836 ///    An 8-bit integral value used to initialize bits [247:240] of the result.
   3837 /// \param __b29
   3838 ///    An 8-bit integral value used to initialize bits [239:232] of the result.
   3839 /// \param __b28
   3840 ///    An 8-bit integral value used to initialize bits [231:224] of the result.
   3841 /// \param __b27
   3842 ///    An 8-bit integral value used to initialize bits [223:216] of the result.
   3843 /// \param __b26
   3844 ///    An 8-bit integral value used to initialize bits [215:208] of the result.
   3845 /// \param __b25
   3846 ///    An 8-bit integral value used to initialize bits [207:200] of the result.
   3847 /// \param __b24
   3848 ///    An 8-bit integral value used to initialize bits [199:192] of the result.
   3849 /// \param __b23
   3850 ///    An 8-bit integral value used to initialize bits [191:184] of the result.
   3851 /// \param __b22
   3852 ///    An 8-bit integral value used to initialize bits [183:176] of the result.
   3853 /// \param __b21
   3854 ///    An 8-bit integral value used to initialize bits [175:168] of the result.
   3855 /// \param __b20
   3856 ///    An 8-bit integral value used to initialize bits [167:160] of the result.
   3857 /// \param __b19
   3858 ///    An 8-bit integral value used to initialize bits [159:152] of the result.
   3859 /// \param __b18
   3860 ///    An 8-bit integral value used to initialize bits [151:144] of the result.
   3861 /// \param __b17
   3862 ///    An 8-bit integral value used to initialize bits [143:136] of the result.
   3863 /// \param __b16
   3864 ///    An 8-bit integral value used to initialize bits [135:128] of the result.
   3865 /// \param __b15
   3866 ///    An 8-bit integral value used to initialize bits [127:120] of the result.
   3867 /// \param __b14
   3868 ///    An 8-bit integral value used to initialize bits [119:112] of the result.
   3869 /// \param __b13
   3870 ///    An 8-bit integral value used to initialize bits [111:104] of the result.
   3871 /// \param __b12
   3872 ///    An 8-bit integral value used to initialize bits [103:96] of the result.
   3873 /// \param __b11
   3874 ///    An 8-bit integral value used to initialize bits [95:88] of the result.
   3875 /// \param __b10
   3876 ///    An 8-bit integral value used to initialize bits [87:80] of the result.
   3877 /// \param __b09
   3878 ///    An 8-bit integral value used to initialize bits [79:72] of the result.
   3879 /// \param __b08
   3880 ///    An 8-bit integral value used to initialize bits [71:64] of the result.
   3881 /// \param __b07
   3882 ///    An 8-bit integral value used to initialize bits [63:56] of the result.
   3883 /// \param __b06
   3884 ///    An 8-bit integral value used to initialize bits [55:48] of the result.
   3885 /// \param __b05
   3886 ///    An 8-bit integral value used to initialize bits [47:40] of the result.
   3887 /// \param __b04
   3888 ///    An 8-bit integral value used to initialize bits [39:32] of the result.
   3889 /// \param __b03
   3890 ///    An 8-bit integral value used to initialize bits [31:24] of the result.
   3891 /// \param __b02
   3892 ///    An 8-bit integral value used to initialize bits [23:16] of the result.
   3893 /// \param __b01
   3894 ///    An 8-bit integral value used to initialize bits [15:8] of the result.
   3895 /// \param __b00
   3896 ///    An 8-bit integral value used to initialize bits [7:0] of the result.
   3897 /// \returns An initialized 256-bit integer vector.
   3898 static __inline __m256i __DEFAULT_FN_ATTRS
   3899 _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
   3900                 char __b27, char __b26, char __b25, char __b24,
   3901                 char __b23, char __b22, char __b21, char __b20,
   3902                 char __b19, char __b18, char __b17, char __b16,
   3903                 char __b15, char __b14, char __b13, char __b12,
   3904                 char __b11, char __b10, char __b09, char __b08,
   3905                 char __b07, char __b06, char __b05, char __b04,
   3906                 char __b03, char __b02, char __b01, char __b00)
   3907 { /* Parameters are declared __b31 first; the initializer lists __b00 first so it occupies the lowest bits. */
   3908   return (__m256i)(__v32qi){ /* 32 byte-sized initializers, then cast to the generic __m256i */
   3909     __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
   3910     __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
   3911     __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
   3912     __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
   3913   };
   3914 }
   3915 
   3916 /// \brief Constructs a 256-bit integer vector initialized with the specified
   3917 ///    64-bit integral values.
   3918 ///
   3919 /// \headerfile <x86intrin.h>
   3920 ///
   3921 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
   3922 ///   instruction.
   3923 ///
   3924 /// \param __a
   3925 ///    A 64-bit integral value used to initialize bits [255:192] of the result.
   3926 /// \param __b
   3927 ///    A 64-bit integral value used to initialize bits [191:128] of the result.
   3928 /// \param __c
   3929 ///    A 64-bit integral value used to initialize bits [127:64] of the result.
   3930 /// \param __d
   3931 ///    A 64-bit integral value used to initialize bits [63:0] of the result.
   3932 /// \returns An initialized 256-bit integer vector.
   3933 static __inline __m256i __DEFAULT_FN_ATTRS
   3934 _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
   3935 { /* Built as a long long-lane vector with the arguments reversed (__d lowest), then cast to __m256i. */
   3936   return (__m256i)(__v4di){ __d, __c, __b, __a };
   3937 }
   3938 
   3939 /* Create vectors with elements in reverse order */
   3940 /// \brief Constructs a 256-bit floating-point vector of [4 x double],
   3941 ///    initialized in reverse order with the specified double-precision
   3942 ///    floating-point values.
   3943 ///
   3944 /// \headerfile <x86intrin.h>
   3945 ///
   3946 /// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
   3947 ///   instruction.
   3948 ///
   3949 /// \param __a
   3950 ///    A double-precision floating-point value used to initialize bits [63:0]
   3951 ///    of the result.
   3952 /// \param __b
   3953 ///    A double-precision floating-point value used to initialize bits [127:64]
   3954 ///    of the result.
   3955 /// \param __c
   3956 ///    A double-precision floating-point value used to initialize bits [191:128]
   3957 ///    of the result.
   3958 /// \param __d
   3959 ///    A double-precision floating-point value used to initialize bits [255:192]
   3960 ///    of the result.
   3961 /// \returns An initialized 256-bit floating-point vector of [4 x double].
   3962 static __inline __m256d __DEFAULT_FN_ATTRS
   3963 _mm256_setr_pd(double __a, double __b, double __c, double __d)
   3964 { /* Arguments are used in declaration order, so __a becomes the first (lowest) element. */
   3965   return (__m256d){ __a, __b, __c, __d };
   3966 }
   3967 
   3968 /// \brief Constructs a 256-bit floating-point vector of [8 x float],
   3969 ///    initialized in reverse order with the specified single-precision
   3970 ///    floating-point values.
   3971 ///
   3972 /// \headerfile <x86intrin.h>
   3973 ///
   3974 /// This intrinsic is a utility function and does not correspond to a specific
   3975 ///   instruction.
   3976 ///
   3977 /// \param __a
   3978 ///    A single-precision floating-point value used to initialize bits [31:0]
   3979 ///    of the result.
   3980 /// \param __b
   3981 ///    A single-precision floating-point value used to initialize bits [63:32]
   3982 ///    of the result.
   3983 /// \param __c
   3984 ///    A single-precision floating-point value used to initialize bits [95:64]
   3985 ///    of the result.
   3986 /// \param __d
   3987 ///    A single-precision floating-point value used to initialize bits [127:96]
   3988 ///    of the result.
   3989 /// \param __e
   3990 ///    A single-precision floating-point value used to initialize bits [159:128]
   3991 ///    of the result.
   3992 /// \param __f
   3993 ///    A single-precision floating-point value used to initialize bits [191:160]
   3994 ///    of the result.
   3995 /// \param __g
   3996 ///    A single-precision floating-point value used to initialize bits [223:192]
   3997 ///    of the result.
   3998 /// \param __h
   3999 ///    A single-precision floating-point value used to initialize bits [255:224]
   4000 ///    of the result.
   4001 /// \returns An initialized 256-bit floating-point vector of [8 x float].
   4002 static __inline __m256 __DEFAULT_FN_ATTRS
   4003 _mm256_setr_ps(float __a, float __b, float __c, float __d,
   4004                float __e, float __f, float __g, float __h)
   4005 { /* Arguments are used in declaration order, so __a becomes the first (lowest) element. */
   4006   return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
   4007 }
   4008 
   4009 /// \brief Constructs a 256-bit integer vector, initialized in reverse order
   4010 ///    with the specified 32-bit integral values.
   4011 ///
   4012 /// \headerfile <x86intrin.h>
   4013 ///
   4014 /// This intrinsic is a utility function and does not correspond to a specific
   4015 ///   instruction.
   4016 ///
   4017 /// \param __i0
   4018 ///    A 32-bit integral value used to initialize bits [31:0] of the result.
   4019 /// \param __i1
   4020 ///    A 32-bit integral value used to initialize bits [63:32] of the result.
   4021 /// \param __i2
   4022 ///    A 32-bit integral value used to initialize bits [95:64] of the result.
   4023 /// \param __i3
   4024 ///    A 32-bit integral value used to initialize bits [127:96] of the result.
   4025 /// \param __i4
   4026 ///    A 32-bit integral value used to initialize bits [159:128] of the result.
   4027 /// \param __i5
   4028 ///    A 32-bit integral value used to initialize bits [191:160] of the result.
   4029 /// \param __i6
   4030 ///    A 32-bit integral value used to initialize bits [223:192] of the result.
   4031 /// \param __i7
   4032 ///    A 32-bit integral value used to initialize bits [255:224] of the result.
   4033 /// \returns An initialized 256-bit integer vector.
   4034 static __inline __m256i __DEFAULT_FN_ATTRS
   4035 _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
   4036                   int __i4, int __i5, int __i6, int __i7)
   4037 { /* Built as an int-lane vector in declaration order (__i0 lowest), then cast to the generic __m256i. */
   4038   return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
   4039 }
   4040 
   4041 /// \brief Constructs a 256-bit integer vector, initialized in reverse order
   4042 ///    with the specified 16-bit integral values.
   4043 ///
   4044 /// \headerfile <x86intrin.h>
   4045 ///
   4046 /// This intrinsic is a utility function and does not correspond to a specific
   4047 ///   instruction.
   4048 ///
   4049 /// \param __w15
   4050 ///    A 16-bit integral value used to initialize bits [15:0] of the result.
   4051 /// \param __w14
   4052 ///    A 16-bit integral value used to initialize bits [31:16] of the result.
   4053 /// \param __w13
   4054 ///    A 16-bit integral value used to initialize bits [47:32] of the result.
   4055 /// \param __w12
   4056 ///    A 16-bit integral value used to initialize bits [63:48] of the result.
   4057 /// \param __w11
   4058 ///    A 16-bit integral value used to initialize bits [79:64] of the result.
   4059 /// \param __w10
   4060 ///    A 16-bit integral value used to initialize bits [95:80] of the result.
   4061 /// \param __w09
   4062 ///    A 16-bit integral value used to initialize bits [111:96] of the result.
   4063 /// \param __w08
   4064 ///    A 16-bit integral value used to initialize bits [127:112] of the result.
   4065 /// \param __w07
   4066 ///    A 16-bit integral value used to initialize bits [143:128] of the result.
   4067 /// \param __w06
   4068 ///    A 16-bit integral value used to initialize bits [159:144] of the result.
   4069 /// \param __w05
   4070 ///    A 16-bit integral value used to initialize bits [175:160] of the result.
   4071 /// \param __w04
   4072 ///    A 16-bit integral value used to initialize bits [191:176] of the result.
   4073 /// \param __w03
   4074 ///    A 16-bit integral value used to initialize bits [207:192] of the result.
   4075 /// \param __w02
   4076 ///    A 16-bit integral value used to initialize bits [223:208] of the result.
   4077 /// \param __w01
   4078 ///    A 16-bit integral value used to initialize bits [239:224] of the result.
   4079 /// \param __w00
   4080 ///    A 16-bit integral value used to initialize bits [255:240] of the result.
   4081 /// \returns An initialized 256-bit integer vector.
   4082 static __inline __m256i __DEFAULT_FN_ATTRS
   4083 _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
   4084        short __w11, short __w10, short __w09, short __w08,
   4085        short __w07, short __w06, short __w05, short __w04,
   4086        short __w03, short __w02, short __w01, short __w00)
   4087 {
   4088   return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
   4089     __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
   4090 }
   4091 
   4092 /// \brief Constructs a 256-bit integer vector, initialized in reverse order
   4093 ///    with the specified 8-bit integral values.
   4094 ///
   4095 /// \headerfile <x86intrin.h>
   4096 ///
   4097 /// This intrinsic is a utility function and does not correspond to a specific
   4098 ///   instruction.
   4099 ///
   4100 /// \param __b31
   4101 ///    An 8-bit integral value used to initialize bits [7:0] of the result.
   4102 /// \param __b30
   4103 ///    An 8-bit integral value used to initialize bits [15:8] of the result.
   4104 /// \param __b29
   4105 ///    An 8-bit integral value used to initialize bits [23:16] of the result.
   4106 /// \param __b28
   4107 ///    An 8-bit integral value used to initialize bits [31:24] of the result.
   4108 /// \param __b27
   4109 ///    An 8-bit integral value used to initialize bits [39:32] of the result.
   4110 /// \param __b26
   4111 ///    An 8-bit integral value used to initialize bits [47:40] of the result.
   4112 /// \param __b25
   4113 ///    An 8-bit integral value used to initialize bits [55:48] of the result.
   4114 /// \param __b24
   4115 ///    An 8-bit integral value used to initialize bits [63:56] of the result.
   4116 /// \param __b23
   4117 ///    An 8-bit integral value used to initialize bits [71:64] of the result.
   4118 /// \param __b22
   4119 ///    An 8-bit integral value used to initialize bits [79:72] of the result.
   4120 /// \param __b21
   4121 ///    An 8-bit integral value used to initialize bits [87:80] of the result.
   4122 /// \param __b20
   4123 ///    An 8-bit integral value used to initialize bits [95:88] of the result.
   4124 /// \param __b19
   4125 ///    An 8-bit integral value used to initialize bits [103:96] of the result.
   4126 /// \param __b18
   4127 ///    An 8-bit integral value used to initialize bits [111:104] of the result.
   4128 /// \param __b17
   4129 ///    An 8-bit integral value used to initialize bits [119:112] of the result.
   4130 /// \param __b16
   4131 ///    An 8-bit integral value used to initialize bits [127:120] of the result.
   4132 /// \param __b15
   4133 ///    An 8-bit integral value used to initialize bits [135:128] of the result.
   4134 /// \param __b14
   4135 ///    An 8-bit integral value used to initialize bits [143:136] of the result.
   4136 /// \param __b13
   4137 ///    An 8-bit integral value used to initialize bits [151:144] of the result.
   4138 /// \param __b12
   4139 ///    An 8-bit integral value used to initialize bits [159:152] of the result.
   4140 /// \param __b11
   4141 ///    An 8-bit integral value used to initialize bits [167:160] of the result.
   4142 /// \param __b10
   4143 ///    An 8-bit integral value used to initialize bits [175:168] of the result.
   4144 /// \param __b09
   4145 ///    An 8-bit integral value used to initialize bits [183:176] of the result.
   4146 /// \param __b08
   4147 ///    An 8-bit integral value used to initialize bits [191:184] of the result.
   4148 /// \param __b07
   4149 ///    An 8-bit integral value used to initialize bits [199:192] of the result.
   4150 /// \param __b06
   4151 ///    An 8-bit integral value used to initialize bits [207:200] of the result.
   4152 /// \param __b05
   4153 ///    An 8-bit integral value used to initialize bits [215:208] of the result.
   4154 /// \param __b04
   4155 ///    An 8-bit integral value used to initialize bits [223:216] of the result.
   4156 /// \param __b03
   4157 ///    An 8-bit integral value used to initialize bits [231:224] of the result.
   4158 /// \param __b02
   4159 ///    An 8-bit integral value used to initialize bits [239:232] of the result.
   4160 /// \param __b01
   4161 ///    An 8-bit integral value used to initialize bits [247:240] of the result.
   4162 /// \param __b00
   4163 ///    An 8-bit integral value used to initialize bits [255:248] of the result.
   4164 /// \returns An initialized 256-bit integer vector.
   4165 static __inline __m256i __DEFAULT_FN_ATTRS
   4166 _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
   4167                  char __b27, char __b26, char __b25, char __b24,
   4168                  char __b23, char __b22, char __b21, char __b20,
   4169                  char __b19, char __b18, char __b17, char __b16,
   4170                  char __b15, char __b14, char __b13, char __b12,
   4171                  char __b11, char __b10, char __b09, char __b08,
   4172                  char __b07, char __b06, char __b05, char __b04,
   4173                  char __b03, char __b02, char __b01, char __b00)
   4174 {
   4175   return (__m256i)(__v32qi){
   4176     __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
   4177     __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
   4178     __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
   4179     __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
   4180 }
   4181 
   4182 /// \brief Constructs a 256-bit integer vector, initialized in reverse order
   4183 ///    with the specified 64-bit integral values.
   4184 ///
   4185 /// \headerfile <x86intrin.h>
   4186 ///
   4187 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
   4188 ///   instruction.
   4189 ///
   4190 /// \param __a
   4191 ///    A 64-bit integral value used to initialize bits [63:0] of the result.
   4192 /// \param __b
   4193 ///    A 64-bit integral value used to initialize bits [127:64] of the result.
   4194 /// \param __c
   4195 ///    A 64-bit integral value used to initialize bits [191:128] of the result.
   4196 /// \param __d
   4197 ///    A 64-bit integral value used to initialize bits [255:192] of the result.
   4198 /// \returns An initialized 256-bit integer vector.
   4199 static __inline __m256i __DEFAULT_FN_ATTRS
   4200 _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
   4201 {
   4202   return (__m256i)(__v4di){ __a, __b, __c, __d };
   4203 }
   4204 
   4205 /* Create vectors with repeated elements */
   4206 /// \brief Constructs a 256-bit floating-point vector of [4 x double], with each
   4207 ///    of the four double-precision floating-point vector elements set to the
   4208 ///    specified double-precision floating-point value.
   4209 ///
   4210 /// \headerfile <x86intrin.h>
   4211 ///
   4212 /// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
   4213 ///
   4214 /// \param __w
   4215 ///    A double-precision floating-point value used to initialize each vector
   4216 ///    element of the result.
   4217 /// \returns An initialized 256-bit floating-point vector of [4 x double].
   4218 static __inline __m256d __DEFAULT_FN_ATTRS
   4219 _mm256_set1_pd(double __w)
   4220 {
   4221   return (__m256d){ __w, __w, __w, __w };
   4222 }
   4223 
   4224 /// \brief Constructs a 256-bit floating-point vector of [8 x float], with each
   4225 ///    of the eight single-precision floating-point vector elements set to the
   4226 ///    specified single-precision floating-point value.
   4227 ///
   4228 /// \headerfile <x86intrin.h>
   4229 ///
   4230 /// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
   4231 ///   instruction.
   4232 ///
   4233 /// \param __w
   4234 ///    A single-precision floating-point value used to initialize each vector
   4235 ///    element of the result.
   4236 /// \returns An initialized 256-bit floating-point vector of [8 x float].
   4237 static __inline __m256 __DEFAULT_FN_ATTRS
   4238 _mm256_set1_ps(float __w)
   4239 {
   4240   return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
   4241 }
   4242 
   4243 /// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the
   4244 ///    32-bit integral vector elements set to the specified 32-bit integral
   4245 ///    value.
   4246 ///
   4247 /// \headerfile <x86intrin.h>
   4248 ///
   4249 /// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
   4250 ///   instruction.
   4251 ///
   4252 /// \param __i
   4253 ///    A 32-bit integral value used to initialize each vector element of the
   4254 ///    result.
   4255 /// \returns An initialized 256-bit integer vector of [8 x i32].
   4256 static __inline __m256i __DEFAULT_FN_ATTRS
   4257 _mm256_set1_epi32(int __i)
   4258 {
   4259   return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
   4260 }
   4261 
   4262 /// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the
   4263 ///    16-bit integral vector elements set to the specified 16-bit integral
   4264 ///    value.
   4265 ///
   4266 /// \headerfile <x86intrin.h>
   4267 ///
   4268 /// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
   4269 ///
   4270 /// \param __w
   4271 ///    A 16-bit integral value used to initialize each vector element of the
   4272 ///    result.
   4273 /// \returns An initialized 256-bit integer vector of [16 x i16].
   4274 static __inline __m256i __DEFAULT_FN_ATTRS
   4275 _mm256_set1_epi16(short __w)
   4276 {
   4277   return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
   4278     __w, __w, __w, __w, __w, __w };
   4279 }
   4280 
   4281 /// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the
   4282 ///    8-bit integral vector elements set to the specified 8-bit integral value.
   4283 ///
   4284 /// \headerfile <x86intrin.h>
   4285 ///
   4286 /// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
   4287 ///
   4288 /// \param __b
   4289 ///    An 8-bit integral value used to initialize each vector element of the
   4290 ///    result.
   4291 /// \returns An initialized 256-bit integer vector of [32 x i8].
   4292 static __inline __m256i __DEFAULT_FN_ATTRS
   4293 _mm256_set1_epi8(char __b)
   4294 {
   4295   return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
   4296     __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
   4297     __b, __b, __b, __b, __b, __b, __b };
   4298 }
   4299 
   4300 /// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the
   4301 ///    64-bit integral vector elements set to the specified 64-bit integral
   4302 ///    value.
   4303 ///
   4304 /// \headerfile <x86intrin.h>
   4305 ///
   4306 /// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
   4307 ///
   4308 /// \param __q
   4309 ///    A 64-bit integral value used to initialize each vector element of the
   4310 ///    result.
   4311 /// \returns An initialized 256-bit integer vector of [4 x i64].
   4312 static __inline __m256i __DEFAULT_FN_ATTRS
   4313 _mm256_set1_epi64x(long long __q)
   4314 {
   4315   return (__m256i)(__v4di){ __q, __q, __q, __q };
   4316 }
   4317 
   4318 /* Create zeroed vectors */
   4319 /// \brief Constructs a 256-bit floating-point vector of [4 x double] with all
   4320 ///    vector elements initialized to zero.
   4321 ///
   4322 /// \headerfile <x86intrin.h>
   4323 ///
   4324 /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
   4325 ///
   4326 /// \returns A 256-bit vector of [4 x double] with all elements set to zero.
   4327 static __inline __m256d __DEFAULT_FN_ATTRS
   4328 _mm256_setzero_pd(void)
   4329 {
   4330   return (__m256d){ 0, 0, 0, 0 };
   4331 }
   4332 
   4333 /// \brief Constructs a 256-bit floating-point vector of [8 x float] with all
   4334 ///    vector elements initialized to zero.
   4335 ///
   4336 /// \headerfile <x86intrin.h>
   4337 ///
   4338 /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
   4339 ///
   4340 /// \returns A 256-bit vector of [8 x float] with all elements set to zero.
   4341 static __inline __m256 __DEFAULT_FN_ATTRS
   4342 _mm256_setzero_ps(void)
   4343 {
   4344   return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
   4345 }
   4346 
   4347 /// \brief Constructs a 256-bit integer vector initialized to zero.
   4348 ///
   4349 /// \headerfile <x86intrin.h>
   4350 ///
   4351 /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
   4352 ///
   4353 /// \returns A 256-bit integer vector initialized to zero.
   4354 static __inline __m256i __DEFAULT_FN_ATTRS
   4355 _mm256_setzero_si256(void)
   4356 {
   4357   return (__m256i){ 0LL, 0LL, 0LL, 0LL };
   4358 }
   4359 
   4360 /* Cast between vector types */
   4361 /// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
   4362 ///    floating-point vector of [8 x float].
   4363 ///
   4364 /// \headerfile <x86intrin.h>
   4365 ///
   4366 /// This intrinsic has no corresponding instruction.
   4367 ///
   4368 /// \param __a
   4369 ///    A 256-bit floating-point vector of [4 x double].
   4370 /// \returns A 256-bit floating-point vector of [8 x float] containing the same
   4371 ///    bitwise pattern as the parameter.
   4372 static __inline __m256 __DEFAULT_FN_ATTRS
   4373 _mm256_castpd_ps(__m256d __a)
   4374 {
   4375   return (__m256)__a;
   4376 }
   4377 
   4378 /// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
   4379 ///    integer vector.
   4380 ///
   4381 /// \headerfile <x86intrin.h>
   4382 ///
   4383 /// This intrinsic has no corresponding instruction.
   4384 ///
   4385 /// \param __a
   4386 ///    A 256-bit floating-point vector of [4 x double].
   4387 /// \returns A 256-bit integer vector containing the same bitwise pattern as the
   4388 ///    parameter.
   4389 static __inline __m256i __DEFAULT_FN_ATTRS
   4390 _mm256_castpd_si256(__m256d __a)
   4391 {
   4392   return (__m256i)__a;
   4393 }
   4394 
   4395 /// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
   4396 ///    floating-point vector of [4 x double].
   4397 ///
   4398 /// \headerfile <x86intrin.h>
   4399 ///
   4400 /// This intrinsic has no corresponding instruction.
   4401 ///
   4402 /// \param __a
   4403 ///    A 256-bit floating-point vector of [8 x float].
   4404 /// \returns A 256-bit floating-point vector of [4 x double] containing the same
   4405 ///    bitwise pattern as the parameter.
   4406 static __inline __m256d __DEFAULT_FN_ATTRS
   4407 _mm256_castps_pd(__m256 __a)
   4408 {
   4409   return (__m256d)__a;
   4410 }
   4411 
   4412 /// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
   4413 ///    integer vector.
   4414 ///
   4415 /// \headerfile <x86intrin.h>
   4416 ///
   4417 /// This intrinsic has no corresponding instruction.
   4418 ///
   4419 /// \param __a
   4420 ///    A 256-bit floating-point vector of [8 x float].
   4421 /// \returns A 256-bit integer vector containing the same bitwise pattern as the
   4422 ///    parameter.
   4423 static __inline __m256i __DEFAULT_FN_ATTRS
   4424 _mm256_castps_si256(__m256 __a)
   4425 {
   4426   return (__m256i)__a;
   4427 }
   4428 
   4429 /// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
   4430 ///    of [8 x float].
   4431 ///
   4432 /// \headerfile <x86intrin.h>
   4433 ///
   4434 /// This intrinsic has no corresponding instruction.
   4435 ///
   4436 /// \param __a
   4437 ///    A 256-bit integer vector.
   4438 /// \returns A 256-bit floating-point vector of [8 x float] containing the same
   4439 ///    bitwise pattern as the parameter.
   4440 static __inline __m256 __DEFAULT_FN_ATTRS
   4441 _mm256_castsi256_ps(__m256i __a)
   4442 {
   4443   return (__m256)__a;
   4444 }
   4445 
   4446 /// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
   4447 ///    of [4 x double].
   4448 ///
   4449 /// \headerfile <x86intrin.h>
   4450 ///
   4451 /// This intrinsic has no corresponding instruction.
   4452 ///
   4453 /// \param __a
   4454 ///    A 256-bit integer vector.
   4455 /// \returns A 256-bit floating-point vector of [4 x double] containing the same
   4456 ///    bitwise pattern as the parameter.
   4457 static __inline __m256d __DEFAULT_FN_ATTRS
   4458 _mm256_castsi256_pd(__m256i __a)
   4459 {
   4460   return (__m256d)__a;
   4461 }
   4462 
   4463 /// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
   4464 ///    [4 x double] as a 128-bit floating-point vector of [2 x double].
   4465 ///
   4466 /// \headerfile <x86intrin.h>
   4467 ///
   4468 /// This intrinsic has no corresponding instruction.
   4469 ///
   4470 /// \param __a
   4471 ///    A 256-bit floating-point vector of [4 x double].
   4472 /// \returns A 128-bit floating-point vector of [2 x double] containing the
   4473 ///    lower 128 bits of the parameter.
   4474 static __inline __m128d __DEFAULT_FN_ATTRS
   4475 _mm256_castpd256_pd128(__m256d __a)
   4476 {
   4477   return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
   4478 }
   4479 
   4480 /// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
   4481 ///    [8 x float] as a 128-bit floating-point vector of [4 x float].
   4482 ///
   4483 /// \headerfile <x86intrin.h>
   4484 ///
   4485 /// This intrinsic has no corresponding instruction.
   4486 ///
   4487 /// \param __a
   4488 ///    A 256-bit floating-point vector of [8 x float].
   4489 /// \returns A 128-bit floating-point vector of [4 x float] containing the
   4490 ///    lower 128 bits of the parameter.
   4491 static __inline __m128 __DEFAULT_FN_ATTRS
   4492 _mm256_castps256_ps128(__m256 __a)
   4493 {
   4494   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
   4495 }
   4496 
   4497 /// \brief Truncates a 256-bit integer vector into a 128-bit integer vector.
   4498 ///
   4499 /// \headerfile <x86intrin.h>
   4500 ///
   4501 /// This intrinsic has no corresponding instruction.
   4502 ///
   4503 /// \param __a
   4504 ///    A 256-bit integer vector.
   4505 /// \returns A 128-bit integer vector containing the lower 128 bits of the
   4506 ///    parameter.
   4507 static __inline __m128i __DEFAULT_FN_ATTRS
   4508 _mm256_castsi256_si128(__m256i __a)
   4509 {
   4510   return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
   4511 }
   4512 
   4513 /// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
   4514 ///    128-bit floating-point vector of [2 x double].
   4515 ///
   4516 ///    The lower 128 bits contain the value of the source vector. The contents
   4517 ///    of the upper 128 bits are undefined.
   4518 ///
   4519 /// \headerfile <x86intrin.h>
   4520 ///
   4521 /// This intrinsic has no corresponding instruction.
   4522 ///
   4523 /// \param __a
   4524 ///    A 128-bit vector of [2 x double].
   4525 /// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
   4526 ///    contain the value of the parameter. The contents of the upper 128 bits
   4527 ///    are undefined.
   4528 static __inline __m256d __DEFAULT_FN_ATTRS
   4529 _mm256_castpd128_pd256(__m128d __a)
   4530 {
   4531   return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
   4532 }
   4533 
   4534 /// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
   4535 ///    128-bit floating-point vector of [4 x float].
   4536 ///
   4537 ///    The lower 128 bits contain the value of the source vector. The contents
   4538 ///    of the upper 128 bits are undefined.
   4539 ///
   4540 /// \headerfile <x86intrin.h>
   4541 ///
   4542 /// This intrinsic has no corresponding instruction.
   4543 ///
   4544 /// \param __a
   4545 ///    A 128-bit vector of [4 x float].
   4546 /// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
   4547 ///    contain the value of the parameter. The contents of the upper 128 bits
   4548 ///    are undefined.
   4549 static __inline __m256 __DEFAULT_FN_ATTRS
   4550 _mm256_castps128_ps256(__m128 __a)
   4551 {
   4552   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
   4553 }
   4554 
   4555 /// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
   4556 ///
   4557 ///    The lower 128 bits contain the value of the source vector. The contents
   4558 ///    of the upper 128 bits are undefined.
   4559 ///
   4560 /// \headerfile <x86intrin.h>
   4561 ///
   4562 /// This intrinsic has no corresponding instruction.
   4563 ///
   4564 /// \param __a
   4565 ///    A 128-bit integer vector.
   4566 /// \returns A 256-bit integer vector. The lower 128 bits contain the value of
   4567 ///    the parameter. The contents of the upper 128 bits are undefined.
   4568 static __inline __m256i __DEFAULT_FN_ATTRS
   4569 _mm256_castsi128_si256(__m128i __a)
   4570 {
   4571   return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
   4572 }
   4573 
   4574 /// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
   4575 ///    128-bit floating-point vector of [2 x double]. The lower 128 bits
   4576 ///    contain the value of the source vector. The upper 128 bits are set
   4577 ///    to zero.
   4578 ///
   4579 /// \headerfile <x86intrin.h>
   4580 ///
   4581 /// This intrinsic has no corresponding instruction.
   4582 ///
   4583 /// \param __a
   4584 ///    A 128-bit vector of [2 x double].
   4585 /// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
   4586 ///    contain the value of the parameter. The upper 128 bits are set to zero.
   4587 static __inline __m256d __DEFAULT_FN_ATTRS
   4588 _mm256_zextpd128_pd256(__m128d __a)
   4589 {
   4590   return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
   4591 }
   4592 
   4593 /// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
   4594 ///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
   4595 ///    the value of the source vector. The upper 128 bits are set to zero.
   4596 ///
   4597 /// \headerfile <x86intrin.h>
   4598 ///
   4599 /// This intrinsic has no corresponding instruction.
   4600 ///
   4601 /// \param __a
   4602 ///    A 128-bit vector of [4 x float].
   4603 /// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
   4604 ///    contain the value of the parameter. The upper 128 bits are set to zero.
   4605 static __inline __m256 __DEFAULT_FN_ATTRS
   4606 _mm256_zextps128_ps256(__m128 __a)
   4607 {
   4608   return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
   4609 }
   4610 
   4611 /// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
   4612 ///    The lower 128 bits contain the value of the source vector. The upper
   4613 ///    128 bits are set to zero.
   4614 ///
   4615 /// \headerfile <x86intrin.h>
   4616 ///
   4617 /// This intrinsic has no corresponding instruction.
   4618 ///
   4619 /// \param __a
   4620 ///    A 128-bit integer vector.
   4621 /// \returns A 256-bit integer vector. The lower 128 bits contain the value of
   4622 ///    the parameter. The upper 128 bits are set to zero.
   4623 static __inline __m256i __DEFAULT_FN_ATTRS
   4624 _mm256_zextsi128_si256(__m128i __a)
   4625 {
   4626   return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
   4627 }
   4628 
   4629 /*
   4630    Vector insert.
   4631    We use macros rather than inlines because we only want to accept
   4632    invocations where the immediate M is a constant expression.
   4633 */
   /// \brief Produces a 256-bit vector of [8 x float] that is a copy of the
   ///    first parameter with either its upper or its lower 128 bits replaced
   ///    by the 128-bit vector of [4 x float] given in the second parameter.
   ///
   ///    The immediate integer parameter selects which 128-bit half is
   ///    replaced.
   ///
   ///    \a V2 is widened with _mm256_castps128_ps256 and passed as the second
   ///    shuffle operand, so its four elements are addressed by shuffle
   ///    indices 8 to 11; the undefined upper half of the widened vector is
   ///    never selected.
   ///
   /// \headerfile <x86intrin.h>
   ///
   /// \code
   /// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
   /// \endcode
   ///
   /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   ///
   /// \param V1
   ///    A 256-bit vector of [8 x float]. It supplies the 128-bit half of the
   ///    result that is not replaced by \a V2.
   /// \param V2
   ///    A 128-bit vector of [4 x float]. It is written to either the upper or
   ///    the lower 128 bits of the result, as selected by \a M.
   /// \param M
   ///    An immediate integer. Only the least significant bit is used: \n
   ///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result
   ///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
   ///    result. \n
   ///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
   ///    result and bits [127:0] of \a V1 are copied to bits [127:0] of the
   ///    result.
   /// \returns A 256-bit vector of [8 x float] containing the combined values.
   #define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
     (__m256)__builtin_shufflevector( \
       (__v8sf)(__m256)(V1), \
       (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
       ((((M) & 1) == 0) ?  8 :  0), \
       ((((M) & 1) == 0) ?  9 :  1), \
       ((((M) & 1) == 0) ? 10 :  2), \
       ((((M) & 1) == 0) ? 11 :  3), \
       ((((M) & 1) == 0) ?  4 :  8), \
       ((((M) & 1) == 0) ?  5 :  9), \
       ((((M) & 1) == 0) ?  6 : 10), \
       ((((M) & 1) == 0) ?  7 : 11) );})
   4680 
   /// \brief Produces a 256-bit vector of [4 x double] that is a copy of the
   ///    first parameter with either its upper or its lower 128 bits replaced
   ///    by the 128-bit vector of [2 x double] given in the second parameter.
   ///
   ///    The immediate integer parameter selects which 128-bit half is
   ///    replaced.
   ///
   ///    \a V2 is widened with _mm256_castpd128_pd256 and passed as the second
   ///    shuffle operand, so its two elements are addressed by shuffle indices
   ///    4 and 5; the undefined upper half of the widened vector is never
   ///    selected.
   ///
   /// \headerfile <x86intrin.h>
   ///
   /// \code
   /// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
   /// \endcode
   ///
   /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   ///
   /// \param V1
   ///    A 256-bit vector of [4 x double]. It supplies the 128-bit half of the
   ///    result that is not replaced by \a V2.
   /// \param V2
   ///    A 128-bit vector of [2 x double]. It is written to either the upper or
   ///    the lower 128 bits of the result, as selected by \a M.
   /// \param M
   ///    An immediate integer. Only the least significant bit is used: \n
   ///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result
   ///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
   ///    result. \n
   ///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
   ///    result and bits [127:0] of \a V1 are copied to bits [127:0] of the
   ///    result.
   /// \returns A 256-bit vector of [4 x double] containing the combined values.
   #define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
     (__m256d)__builtin_shufflevector( \
       (__v4df)(__m256d)(V1), \
       (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
       ((((M) & 1) == 0) ? 4 : 0), \
       ((((M) & 1) == 0) ? 5 : 1), \
       ((((M) & 1) == 0) ? 2 : 4), \
       ((((M) & 1) == 0) ? 3 : 5) );})
   4723 
   /// \brief Produces a 256-bit integer vector that is a copy of the first
   ///    parameter with either its upper or its lower 128 bits replaced by the
   ///    128-bit integer vector given in the second parameter.
   ///
   ///    The immediate integer parameter selects which 128-bit half is
   ///    replaced.
   ///
   ///    \a V2 is widened with _mm256_castsi128_si256 and passed as the second
   ///    shuffle operand, so its two 64-bit elements are addressed by shuffle
   ///    indices 4 and 5; the undefined upper half of the widened vector is
   ///    never selected.
   ///
   /// \headerfile <x86intrin.h>
   ///
   /// \code
   /// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
   /// \endcode
   ///
   /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   ///
   /// \param V1
   ///    A 256-bit integer vector. It supplies the 128-bit half of the result
   ///    that is not replaced by \a V2.
   /// \param V2
   ///    A 128-bit integer vector. It is written to either the upper or the
   ///    lower 128 bits of the result, as selected by \a M.
   /// \param M
   ///    An immediate integer. Only the least significant bit is used: \n
   ///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result
   ///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
   ///    result. \n
   ///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
   ///    result and bits [127:0] of \a V1 are copied to bits [127:0] of the
   ///    result.
   /// \returns A 256-bit integer vector containing the combined values.
   #define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
     (__m256i)__builtin_shufflevector( \
       (__v4di)(__m256i)(V1), \
       (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
       ((((M) & 1) == 0) ? 4 : 0), \
       ((((M) & 1) == 0) ? 5 : 1), \
       ((((M) & 1) == 0) ? 2 : 4), \
       ((((M) & 1) == 0) ? 3 : 5) );})
   4766 
   4767 /*
   4768    Vector extract.
   4769    We use macros rather than inlines because we only want to accept
   4770    invocations where the immediate M is a constant expression.
   4771 */
   4772 /// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
   4773 ///    of [8 x float], as determined by the immediate integer parameter, and
   4774 ///    returns the extracted bits as a 128-bit vector of [4 x float].
   4775 ///
   4776 /// \headerfile <x86intrin.h>
   4777 ///
   4778 /// \code
   4779 /// __m128 _mm256_extractf128_ps(__m256 V, const int M);
   4780 /// \endcode
   4781 ///
   4782 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
   4783 ///
   4784 /// \param V
   4785 ///    A 256-bit vector of [8 x float].
   4786 /// \param M
   4787 ///    An immediate integer. The least significant bit determines which bits are
   4788 ///    extracted from the first parameter: \n
   4789 ///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
   4790 ///    result. \n
   4791 ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
   4792 /// \returns A 128-bit vector of [4 x float] containing the extracted bits.
   4793 #define _mm256_extractf128_ps(V, M) __extension__ ({ \
   4794   (__m128)__builtin_shufflevector( \
   4795     (__v8sf)(__m256)(V), \
   4796     (__v8sf)(_mm256_undefined_ps()), \
   4797     (((M) & 1) ? 4 : 0), \
   4798     (((M) & 1) ? 5 : 1), \
   4799     (((M) & 1) ? 6 : 2), \
   4800     (((M) & 1) ? 7 : 3) );})
   4801 
   4802 /// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
   4803 ///    of [4 x double], as determined by the immediate integer parameter, and
   4804 ///    returns the extracted bits as a 128-bit vector of [2 x double].
   4805 ///
   4806 /// \headerfile <x86intrin.h>
   4807 ///
   4808 /// \code
   4809 /// __m128d _mm256_extractf128_pd(__m256d V, const int M);
   4810 /// \endcode
   4811 ///
   4812 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
   4813 ///
   4814 /// \param V
   4815 ///    A 256-bit vector of [4 x double].
   4816 /// \param M
   4817 ///    An immediate integer. The least significant bit determines which bits are
   4818 ///    extracted from the first parameter: \n
   4819 ///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
   4820 ///    result. \n
   4821 ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
   4822 /// \returns A 128-bit vector of [2 x double] containing the extracted bits.
   4823 #define _mm256_extractf128_pd(V, M) __extension__ ({ \
   4824   (__m128d)__builtin_shufflevector( \
   4825     (__v4df)(__m256d)(V), \
   4826     (__v4df)(_mm256_undefined_pd()), \
   4827     (((M) & 1) ? 2 : 0), \
   4828     (((M) & 1) ? 3 : 1) );})
   4829 
   4830 /// \brief Extracts either the upper or the lower 128 bits from a 256-bit
   4831 ///    integer vector, as determined by the immediate integer parameter, and
   4832 ///    returns the extracted bits as a 128-bit integer vector.
   4833 ///
   4834 /// \headerfile <x86intrin.h>
   4835 ///
   4836 /// \code
   4837 /// __m128i _mm256_extractf128_si256(__m256i V, const int M);
   4838 /// \endcode
   4839 ///
   4840 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
   4841 ///
   4842 /// \param V
   4843 ///    A 256-bit integer vector.
   4844 /// \param M
   4845 ///    An immediate integer. The least significant bit determines which bits are
   4846 ///    extracted from the first parameter:  \n
   4847 ///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
   4848 ///    result. \n
   4849 ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
   4850 /// \returns A 128-bit integer vector containing the extracted bits.
   4851 #define _mm256_extractf128_si256(V, M) __extension__ ({ \
   4852   (__m128i)__builtin_shufflevector( \
   4853     (__v4di)(__m256i)(V), \
   4854     (__v4di)(_mm256_undefined_si256()), \
   4855     (((M) & 1) ? 2 : 0), \
   4856     (((M) & 1) ? 3 : 1) );})
   4857 
   4858 /* SIMD load ops (unaligned) */
   4859 /// \brief Loads two 128-bit floating-point vectors of [4 x float] from
   4860 ///    unaligned memory locations and constructs a 256-bit floating-point vector
   4861 ///    of [8 x float] by concatenating the two 128-bit vectors.
   4862 ///
   4863 /// \headerfile <x86intrin.h>
   4864 ///
   4865 /// This intrinsic corresponds to load instructions followed by the
   4866 ///   <c> VINSERTF128 </c> instruction.
   4867 ///
   4868 /// \param __addr_hi
   4869 ///    A pointer to a 128-bit memory location containing 4 consecutive
   4870 ///    single-precision floating-point values. These values are to be copied to
   4871 ///    bits[255:128] of the result. The address of the memory location does not
   4872 ///    have to be aligned.
   4873 /// \param __addr_lo
   4874 ///    A pointer to a 128-bit memory location containing 4 consecutive
   4875 ///    single-precision floating-point values. These values are to be copied to
   4876 ///    bits[127:0] of the result. The address of the memory location does not
   4877 ///    have to be aligned.
   4878 /// \returns A 256-bit floating-point vector of [8 x float] containing the
   4879 ///    concatenated result.
   4880 static __inline __m256 __DEFAULT_FN_ATTRS
   4881 _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
   4882 {
   4883   __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
   4884   return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
   4885 }
   4886 
   4887 /// \brief Loads two 128-bit floating-point vectors of [2 x double] from
   4888 ///    unaligned memory locations and constructs a 256-bit floating-point vector
   4889 ///    of [4 x double] by concatenating the two 128-bit vectors.
   4890 ///
   4891 /// \headerfile <x86intrin.h>
   4892 ///
   4893 /// This intrinsic corresponds to load instructions followed by the
   4894 ///   <c> VINSERTF128 </c> instruction.
   4895 ///
   4896 /// \param __addr_hi
   4897 ///    A pointer to a 128-bit memory location containing two consecutive
   4898 ///    double-precision floating-point values. These values are to be copied to
   4899 ///    bits[255:128] of the result. The address of the memory location does not
   4900 ///    have to be aligned.
   4901 /// \param __addr_lo
   4902 ///    A pointer to a 128-bit memory location containing two consecutive
   4903 ///    double-precision floating-point values. These values are to be copied to
   4904 ///    bits[127:0] of the result. The address of the memory location does not
   4905 ///    have to be aligned.
   4906 /// \returns A 256-bit floating-point vector of [4 x double] containing the
   4907 ///    concatenated result.
   4908 static __inline __m256d __DEFAULT_FN_ATTRS
   4909 _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
   4910 {
   4911   __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
   4912   return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
   4913 }
   4914 
   4915 /// \brief Loads two 128-bit integer vectors from unaligned memory locations and
   4916 ///    constructs a 256-bit integer vector by concatenating the two 128-bit
   4917 ///    vectors.
   4918 ///
   4919 /// \headerfile <x86intrin.h>
   4920 ///
   4921 /// This intrinsic corresponds to load instructions followed by the
   4922 ///   <c> VINSERTF128 </c> instruction.
   4923 ///
   4924 /// \param __addr_hi
   4925 ///    A pointer to a 128-bit memory location containing a 128-bit integer
   4926 ///    vector. This vector is to be copied to bits[255:128] of the result. The
   4927 ///    address of the memory location does not have to be aligned.
   4928 /// \param __addr_lo
   4929 ///    A pointer to a 128-bit memory location containing a 128-bit integer
   4930 ///    vector. This vector is to be copied to bits[127:0] of the result. The
   4931 ///    address of the memory location does not have to be aligned.
   4932 /// \returns A 256-bit integer vector containing the concatenated result.
   4933 static __inline __m256i __DEFAULT_FN_ATTRS
   4934 _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
   4935 {
   4936   __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
   4937   return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
   4938 }
   4939 
   4940 /* SIMD store ops (unaligned) */
   4941 /// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
   4942 ///    vector of [8 x float] into two different unaligned memory locations.
   4943 ///
   4944 /// \headerfile <x86intrin.h>
   4945 ///
   4946 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
   4947 ///   store instructions.
   4948 ///
   4949 /// \param __addr_hi
   4950 ///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
   4951 ///    copied to this memory location. The address of this memory location does
   4952 ///    not have to be aligned.
   4953 /// \param __addr_lo
   4954 ///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
   4955 ///    copied to this memory location. The address of this memory location does
   4956 ///    not have to be aligned.
   4957 /// \param __a
   4958 ///    A 256-bit floating-point vector of [8 x float].
   4959 static __inline void __DEFAULT_FN_ATTRS
   4960 _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
   4961 {
   4962   __m128 __v128;
   4963 
   4964   __v128 = _mm256_castps256_ps128(__a);
   4965   _mm_storeu_ps(__addr_lo, __v128);
   4966   __v128 = _mm256_extractf128_ps(__a, 1);
   4967   _mm_storeu_ps(__addr_hi, __v128);
   4968 }
   4969 
   4970 /// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
   4971 ///    vector of [4 x double] into two different unaligned memory locations.
   4972 ///
   4973 /// \headerfile <x86intrin.h>
   4974 ///
   4975 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
   4976 ///   store instructions.
   4977 ///
   4978 /// \param __addr_hi
   4979 ///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
   4980 ///    copied to this memory location. The address of this memory location does
   4981 ///    not have to be aligned.
   4982 /// \param __addr_lo
   4983 ///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
   4984 ///    copied to this memory location. The address of this memory location does
   4985 ///    not have to be aligned.
   4986 /// \param __a
   4987 ///    A 256-bit floating-point vector of [4 x double].
   4988 static __inline void __DEFAULT_FN_ATTRS
   4989 _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
   4990 {
   4991   __m128d __v128;
   4992 
   4993   __v128 = _mm256_castpd256_pd128(__a);
   4994   _mm_storeu_pd(__addr_lo, __v128);
   4995   __v128 = _mm256_extractf128_pd(__a, 1);
   4996   _mm_storeu_pd(__addr_hi, __v128);
   4997 }
   4998 
   4999 /// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into
   5000 ///    two different unaligned memory locations.
   5001 ///
   5002 /// \headerfile <x86intrin.h>
   5003 ///
   5004 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
   5005 ///   store instructions.
   5006 ///
   5007 /// \param __addr_hi
   5008 ///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
   5009 ///    copied to this memory location. The address of this memory location does
   5010 ///    not have to be aligned.
   5011 /// \param __addr_lo
   5012 ///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
   5013 ///    copied to this memory location. The address of this memory location does
   5014 ///    not have to be aligned.
   5015 /// \param __a
   5016 ///    A 256-bit integer vector.
   5017 static __inline void __DEFAULT_FN_ATTRS
   5018 _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
   5019 {
   5020   __m128i __v128;
   5021 
   5022   __v128 = _mm256_castsi256_si128(__a);
   5023   _mm_storeu_si128(__addr_lo, __v128);
   5024   __v128 = _mm256_extractf128_si256(__a, 1);
   5025   _mm_storeu_si128(__addr_hi, __v128);
   5026 }
   5027 
   5028 /// \brief Constructs a 256-bit floating-point vector of [8 x float] by
   5029 ///    concatenating two 128-bit floating-point vectors of [4 x float].
   5030 ///
   5031 /// \headerfile <x86intrin.h>
   5032 ///
   5033 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   5034 ///
   5035 /// \param __hi
   5036 ///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
   5037 ///    128 bits of the result.
   5038 /// \param __lo
   5039 ///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
   5040 ///    128 bits of the result.
   5041 /// \returns A 256-bit floating-point vector of [8 x float] containing the
   5042 ///    concatenated result.
   5043 static __inline __m256 __DEFAULT_FN_ATTRS
   5044 _mm256_set_m128 (__m128 __hi, __m128 __lo)
   5045 {
   5046   return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
   5047 }
   5048 
   5049 /// \brief Constructs a 256-bit floating-point vector of [4 x double] by
   5050 ///    concatenating two 128-bit floating-point vectors of [2 x double].
   5051 ///
   5052 /// \headerfile <x86intrin.h>
   5053 ///
   5054 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   5055 ///
   5056 /// \param __hi
   5057 ///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
   5058 ///    128 bits of the result.
   5059 /// \param __lo
   5060 ///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
   5061 ///    128 bits of the result.
   5062 /// \returns A 256-bit floating-point vector of [4 x double] containing the
   5063 ///    concatenated result.
   5064 static __inline __m256d __DEFAULT_FN_ATTRS
   5065 _mm256_set_m128d (__m128d __hi, __m128d __lo)
   5066 {
   5067   return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
   5068 }
   5069 
   5070 /// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
   5071 ///    integer vectors.
   5072 ///
   5073 /// \headerfile <x86intrin.h>
   5074 ///
   5075 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   5076 ///
   5077 /// \param __hi
   5078 ///    A 128-bit integer vector to be copied to the upper 128 bits of the
   5079 ///    result.
   5080 /// \param __lo
   5081 ///    A 128-bit integer vector to be copied to the lower 128 bits of the
   5082 ///    result.
   5083 /// \returns A 256-bit integer vector containing the concatenated result.
   5084 static __inline __m256i __DEFAULT_FN_ATTRS
   5085 _mm256_set_m128i (__m128i __hi, __m128i __lo)
   5086 {
   5087   return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
   5088 }
   5089 
   5090 /// \brief Constructs a 256-bit floating-point vector of [8 x float] by
   5091 ///    concatenating two 128-bit floating-point vectors of [4 x float]. This is
   5092 ///    similar to _mm256_set_m128, but the order of the input parameters is
   5093 ///    swapped.
   5094 ///
   5095 /// \headerfile <x86intrin.h>
   5096 ///
   5097 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   5098 ///
   5099 /// \param __lo
   5100 ///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
   5101 ///    128 bits of the result.
   5102 /// \param __hi
   5103 ///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
   5104 ///    128 bits of the result.
   5105 /// \returns A 256-bit floating-point vector of [8 x float] containing the
   5106 ///    concatenated result.
   5107 static __inline __m256 __DEFAULT_FN_ATTRS
   5108 _mm256_setr_m128 (__m128 __lo, __m128 __hi)
   5109 {
   5110   return _mm256_set_m128(__hi, __lo);
   5111 }
   5112 
   5113 /// \brief Constructs a 256-bit floating-point vector of [4 x double] by
   5114 ///    concatenating two 128-bit floating-point vectors of [2 x double]. This is
   5115 ///    similar to _mm256_set_m128d, but the order of the input parameters is
   5116 ///    swapped.
   5117 ///
   5118 /// \headerfile <x86intrin.h>
   5119 ///
   5120 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   5121 ///
   5122 /// \param __lo
   5123 ///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
   5124 ///    128 bits of the result.
   5125 /// \param __hi
   5126 ///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
   5127 ///    128 bits of the result.
   5128 /// \returns A 256-bit floating-point vector of [4 x double] containing the
   5129 ///    concatenated result.
   5130 static __inline __m256d __DEFAULT_FN_ATTRS
   5131 _mm256_setr_m128d (__m128d __lo, __m128d __hi)
   5132 {
   5133   return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
   5134 }
   5135 
   5136 /// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
   5137 ///    integer vectors. This is similar to _mm256_set_m128i, but the order of
   5138 ///    the input parameters is swapped.
   5139 ///
   5140 /// \headerfile <x86intrin.h>
   5141 ///
   5142 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   5143 ///
   5144 /// \param __lo
   5145 ///    A 128-bit integer vector to be copied to the lower 128 bits of the
   5146 ///    result.
   5147 /// \param __hi
   5148 ///    A 128-bit integer vector to be copied to the upper 128 bits of the
   5149 ///    result.
   5150 /// \returns A 256-bit integer vector containing the concatenated result.
   5151 static __inline __m256i __DEFAULT_FN_ATTRS
   5152 _mm256_setr_m128i (__m128i __lo, __m128i __hi)
   5153 {
   5154   return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
   5155 }
   5156 
   5157 #undef __DEFAULT_FN_ATTRS
   5158 
   5159 #endif /* __AVXINTRIN_H */
   5160