/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
     23 
     24 #ifndef __EMMINTRIN_H
     25 #define __EMMINTRIN_H
     26 
     27 #include <xmmintrin.h>
     28 
     29 typedef double __m128d __attribute__((__vector_size__(16)));
     30 typedef long long __m128i __attribute__((__vector_size__(16)));
     31 
     32 /* Type defines.  */
     33 typedef double __v2df __attribute__ ((__vector_size__ (16)));
     34 typedef long long __v2di __attribute__ ((__vector_size__ (16)));
     35 typedef short __v8hi __attribute__((__vector_size__(16)));
     36 typedef char __v16qi __attribute__((__vector_size__(16)));
     37 
     38 /* Unsigned types */
     39 typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
     40 typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
     41 typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
     42 
     43 /* We need an explicitly signed variant for char. Note that this shouldn't
     44  * appear in the interface though. */
     45 typedef signed char __v16qs __attribute__((__vector_size__(16)));
     46 
     47 #include <f16cintrin.h>
     48 
     49 /* Define the default attributes for the functions in this file. */
     50 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
     51 
     52 /// \brief Adds lower double-precision values in both operands and returns the
     53 ///    sum in the lower 64 bits of the result. The upper 64 bits of the result
     54 ///    are copied from the upper double-precision value of the first operand.
     55 ///
     56 /// \headerfile <x86intrin.h>
     57 ///
     58 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
     59 ///
     60 /// \param __a
     61 ///    A 128-bit vector of [2 x double] containing one of the source operands.
     62 /// \param __b
     63 ///    A 128-bit vector of [2 x double] containing one of the source operands.
     64 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
     65 ///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
     66 ///    from the upper 64 bits of the first source operand.
     67 static __inline__ __m128d __DEFAULT_FN_ATTRS
     68 _mm_add_sd(__m128d __a, __m128d __b)
     69 {
     70   __a[0] += __b[0];
     71   return __a;
     72 }
     73 
     74 /// \brief Adds two 128-bit vectors of [2 x double].
     75 ///
     76 /// \headerfile <x86intrin.h>
     77 ///
     78 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
     79 ///
     80 /// \param __a
     81 ///    A 128-bit vector of [2 x double] containing one of the source operands.
     82 /// \param __b
     83 ///    A 128-bit vector of [2 x double] containing one of the source operands.
     84 /// \returns A 128-bit vector of [2 x double] containing the sums of both
     85 ///    operands.
     86 static __inline__ __m128d __DEFAULT_FN_ATTRS
     87 _mm_add_pd(__m128d __a, __m128d __b)
     88 {
     89   return (__m128d)((__v2df)__a + (__v2df)__b);
     90 }
     91 
     92 /// \brief Subtracts the lower double-precision value of the second operand
     93 ///    from the lower double-precision value of the first operand and returns
     94 ///    the difference in the lower 64 bits of the result. The upper 64 bits of
     95 ///    the result are copied from the upper double-precision value of the first
     96 ///    operand.
     97 ///
     98 /// \headerfile <x86intrin.h>
     99 ///
    100 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
    101 ///
    102 /// \param __a
    103 ///    A 128-bit vector of [2 x double] containing the minuend.
    104 /// \param __b
    105 ///    A 128-bit vector of [2 x double] containing the subtrahend.
    106 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
    107 ///    difference of the lower 64 bits of both operands. The upper 64 bits are
    108 ///    copied from the upper 64 bits of the first source operand.
    109 static __inline__ __m128d __DEFAULT_FN_ATTRS
    110 _mm_sub_sd(__m128d __a, __m128d __b)
    111 {
    112   __a[0] -= __b[0];
    113   return __a;
    114 }
    115 
    116 /// \brief Subtracts two 128-bit vectors of [2 x double].
    117 ///
    118 /// \headerfile <x86intrin.h>
    119 ///
    120 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
    121 ///
    122 /// \param __a
    123 ///    A 128-bit vector of [2 x double] containing the minuend.
    124 /// \param __b
    125 ///    A 128-bit vector of [2 x double] containing the subtrahend.
    126 /// \returns A 128-bit vector of [2 x double] containing the differences between
    127 ///    both operands.
    128 static __inline__ __m128d __DEFAULT_FN_ATTRS
    129 _mm_sub_pd(__m128d __a, __m128d __b)
    130 {
    131   return (__m128d)((__v2df)__a - (__v2df)__b);
    132 }
    133 
    134 /// \brief Multiplies lower double-precision values in both operands and returns
    135 ///    the product in the lower 64 bits of the result. The upper 64 bits of the
    136 ///    result are copied from the upper double-precision value of the first
    137 ///    operand.
    138 ///
    139 /// \headerfile <x86intrin.h>
    140 ///
    141 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
    142 ///
    143 /// \param __a
    144 ///    A 128-bit vector of [2 x double] containing one of the source operands.
    145 /// \param __b
    146 ///    A 128-bit vector of [2 x double] containing one of the source operands.
    147 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
    148 ///    product of the lower 64 bits of both operands. The upper 64 bits are
    149 ///    copied from the upper 64 bits of the first source operand.
    150 static __inline__ __m128d __DEFAULT_FN_ATTRS
    151 _mm_mul_sd(__m128d __a, __m128d __b)
    152 {
    153   __a[0] *= __b[0];
    154   return __a;
    155 }
    156 
    157 /// \brief Multiplies two 128-bit vectors of [2 x double].
    158 ///
    159 /// \headerfile <x86intrin.h>
    160 ///
    161 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
    162 ///
    163 /// \param __a
    164 ///    A 128-bit vector of [2 x double] containing one of the operands.
    165 /// \param __b
    166 ///    A 128-bit vector of [2 x double] containing one of the operands.
    167 /// \returns A 128-bit vector of [2 x double] containing the products of both
    168 ///    operands.
    169 static __inline__ __m128d __DEFAULT_FN_ATTRS
    170 _mm_mul_pd(__m128d __a, __m128d __b)
    171 {
    172   return (__m128d)((__v2df)__a * (__v2df)__b);
    173 }
    174 
    175 /// \brief Divides the lower double-precision value of the first operand by the
    176 ///    lower double-precision value of the second operand and returns the
    177 ///    quotient in the lower 64 bits of the result. The upper 64 bits of the
    178 ///    result are copied from the upper double-precision value of the first
    179 ///    operand.
    180 ///
    181 /// \headerfile <x86intrin.h>
    182 ///
    183 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
    184 ///
    185 /// \param __a
    186 ///    A 128-bit vector of [2 x double] containing the dividend.
    187 /// \param __b
    188 ///    A 128-bit vector of [2 x double] containing divisor.
    189 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
    190 ///    quotient of the lower 64 bits of both operands. The upper 64 bits are
    191 ///    copied from the upper 64 bits of the first source operand.
    192 static __inline__ __m128d __DEFAULT_FN_ATTRS
    193 _mm_div_sd(__m128d __a, __m128d __b)
    194 {
    195   __a[0] /= __b[0];
    196   return __a;
    197 }
    198 
    199 /// \brief Performs an element-by-element division of two 128-bit vectors of
    200 ///    [2 x double].
    201 ///
    202 /// \headerfile <x86intrin.h>
    203 ///
    204 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
    205 ///
    206 /// \param __a
    207 ///    A 128-bit vector of [2 x double] containing the dividend.
    208 /// \param __b
    209 ///    A 128-bit vector of [2 x double] containing the divisor.
    210 /// \returns A 128-bit vector of [2 x double] containing the quotients of both
    211 ///    operands.
    212 static __inline__ __m128d __DEFAULT_FN_ATTRS
    213 _mm_div_pd(__m128d __a, __m128d __b)
    214 {
    215   return (__m128d)((__v2df)__a / (__v2df)__b);
    216 }
    217 
    218 /// \brief Calculates the square root of the lower double-precision value of
    219 ///    the second operand and returns it in the lower 64 bits of the result.
    220 ///    The upper 64 bits of the result are copied from the upper double-
    221 ///    precision value of the first operand.
    222 ///
    223 /// \headerfile <x86intrin.h>
    224 ///
    225 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
    226 ///
    227 /// \param __a
    228 ///    A 128-bit vector of [2 x double] containing one of the operands. The
    229 ///    upper 64 bits of this operand are copied to the upper 64 bits of the
    230 ///    result.
    231 /// \param __b
    232 ///    A 128-bit vector of [2 x double] containing one of the operands. The
    233 ///    square root is calculated using the lower 64 bits of this operand.
    234 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
    235 ///    square root of the lower 64 bits of operand \a __b, and whose upper 64
    236 ///    bits are copied from the upper 64 bits of operand \a __a.
    237 static __inline__ __m128d __DEFAULT_FN_ATTRS
    238 _mm_sqrt_sd(__m128d __a, __m128d __b)
    239 {
    240   __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
    241   return (__m128d) { __c[0], __a[1] };
    242 }
    243 
    244 /// \brief Calculates the square root of the each of two values stored in a
    245 ///    128-bit vector of [2 x double].
    246 ///
    247 /// \headerfile <x86intrin.h>
    248 ///
    249 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
    250 ///
    251 /// \param __a
    252 ///    A 128-bit vector of [2 x double].
    253 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
    254 ///    values in the operand.
    255 static __inline__ __m128d __DEFAULT_FN_ATTRS
    256 _mm_sqrt_pd(__m128d __a)
    257 {
    258   return __builtin_ia32_sqrtpd((__v2df)__a);
    259 }
    260 
    261 /// \brief Compares lower 64-bit double-precision values of both operands, and
    262 ///    returns the lesser of the pair of values in the lower 64-bits of the
    263 ///    result. The upper 64 bits of the result are copied from the upper double-
    264 ///    precision value of the first operand.
    265 ///
    266 /// \headerfile <x86intrin.h>
    267 ///
    268 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
    269 ///
    270 /// \param __a
    271 ///    A 128-bit vector of [2 x double] containing one of the operands. The
    272 ///    lower 64 bits of this operand are used in the comparison.
    273 /// \param __b
    274 ///    A 128-bit vector of [2 x double] containing one of the operands. The
    275 ///    lower 64 bits of this operand are used in the comparison.
    276 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
    277 ///    minimum value between both operands. The upper 64 bits are copied from
    278 ///    the upper 64 bits of the first source operand.
    279 static __inline__ __m128d __DEFAULT_FN_ATTRS
    280 _mm_min_sd(__m128d __a, __m128d __b)
    281 {
    282   return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
    283 }
    284 
    285 /// \brief Performs element-by-element comparison of the two 128-bit vectors of
    286 ///    [2 x double] and returns the vector containing the lesser of each pair of
    287 ///    values.
    288 ///
    289 /// \headerfile <x86intrin.h>
    290 ///
    291 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
    292 ///
    293 /// \param __a
    294 ///    A 128-bit vector of [2 x double] containing one of the operands.
    295 /// \param __b
    296 ///    A 128-bit vector of [2 x double] containing one of the operands.
    297 /// \returns A 128-bit vector of [2 x double] containing the minimum values
    298 ///    between both operands.
    299 static __inline__ __m128d __DEFAULT_FN_ATTRS
    300 _mm_min_pd(__m128d __a, __m128d __b)
    301 {
    302   return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
    303 }
    304 
    305 /// \brief Compares lower 64-bit double-precision values of both operands, and
    306 ///    returns the greater of the pair of values in the lower 64-bits of the
    307 ///    result. The upper 64 bits of the result are copied from the upper double-
    308 ///    precision value of the first operand.
    309 ///
    310 /// \headerfile <x86intrin.h>
    311 ///
    312 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
    313 ///
    314 /// \param __a
    315 ///    A 128-bit vector of [2 x double] containing one of the operands. The
    316 ///    lower 64 bits of this operand are used in the comparison.
    317 /// \param __b
    318 ///    A 128-bit vector of [2 x double] containing one of the operands. The
    319 ///    lower 64 bits of this operand are used in the comparison.
    320 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
    321 ///    maximum value between both operands. The upper 64 bits are copied from
    322 ///    the upper 64 bits of the first source operand.
    323 static __inline__ __m128d __DEFAULT_FN_ATTRS
    324 _mm_max_sd(__m128d __a, __m128d __b)
    325 {
    326   return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
    327 }
    328 
    329 /// \brief Performs element-by-element comparison of the two 128-bit vectors of
    330 ///    [2 x double] and returns the vector containing the greater of each pair
    331 ///    of values.
    332 ///
    333 /// \headerfile <x86intrin.h>
    334 ///
    335 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
    336 ///
    337 /// \param __a
    338 ///    A 128-bit vector of [2 x double] containing one of the operands.
    339 /// \param __b
    340 ///    A 128-bit vector of [2 x double] containing one of the operands.
    341 /// \returns A 128-bit vector of [2 x double] containing the maximum values
    342 ///    between both operands.
    343 static __inline__ __m128d __DEFAULT_FN_ATTRS
    344 _mm_max_pd(__m128d __a, __m128d __b)
    345 {
    346   return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
    347 }
    348 
    349 /// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double].
    350 ///
    351 /// \headerfile <x86intrin.h>
    352 ///
    353 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
    354 ///
    355 /// \param __a
    356 ///    A 128-bit vector of [2 x double] containing one of the source operands.
    357 /// \param __b
    358 ///    A 128-bit vector of [2 x double] containing one of the source operands.
    359 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
    360 ///    values between both operands.
    361 static __inline__ __m128d __DEFAULT_FN_ATTRS
    362 _mm_and_pd(__m128d __a, __m128d __b)
    363 {
    364   return (__m128d)((__v2du)__a & (__v2du)__b);
    365 }
    366 
    367 /// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double], using
    368 ///    the one's complement of the values contained in the first source operand.
    369 ///
    370 /// \headerfile <x86intrin.h>
    371 ///
    372 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
    373 ///
    374 /// \param __a
    375 ///    A 128-bit vector of [2 x double] containing the left source operand. The
    376 ///    one's complement of this value is used in the bitwise AND.
    377 /// \param __b
    378 ///    A 128-bit vector of [2 x double] containing the right source operand.
    379 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
    380 ///    values in the second operand and the one's complement of the first
    381 ///    operand.
    382 static __inline__ __m128d __DEFAULT_FN_ATTRS
    383 _mm_andnot_pd(__m128d __a, __m128d __b)
    384 {
    385   return (__m128d)(~(__v2du)__a & (__v2du)__b);
    386 }
    387 
    388 /// \brief Performs a bitwise OR of two 128-bit vectors of [2 x double].
    389 ///
    390 /// \headerfile <x86intrin.h>
    391 ///
    392 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
    393 ///
    394 /// \param __a
    395 ///    A 128-bit vector of [2 x double] containing one of the source operands.
    396 /// \param __b
    397 ///    A 128-bit vector of [2 x double] containing one of the source operands.
    398 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
    399 ///    values between both operands.
    400 static __inline__ __m128d __DEFAULT_FN_ATTRS
    401 _mm_or_pd(__m128d __a, __m128d __b)
    402 {
    403   return (__m128d)((__v2du)__a | (__v2du)__b);
    404 }
    405 
    406 /// \brief Performs a bitwise XOR of two 128-bit vectors of [2 x double].
    407 ///
    408 /// \headerfile <x86intrin.h>
    409 ///
    410 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
    411 ///
    412 /// \param __a
    413 ///    A 128-bit vector of [2 x double] containing one of the source operands.
    414 /// \param __b
    415 ///    A 128-bit vector of [2 x double] containing one of the source operands.
    416 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
    417 ///    values between both operands.
    418 static __inline__ __m128d __DEFAULT_FN_ATTRS
    419 _mm_xor_pd(__m128d __a, __m128d __b)
    420 {
    421   return (__m128d)((__v2du)__a ^ (__v2du)__b);
    422 }
    423 
    424 /// \brief Compares each of the corresponding double-precision values of the
    425 ///    128-bit vectors of [2 x double] for equality. Each comparison yields 0h
    426 ///    for false, FFFFFFFFFFFFFFFFh for true.
    427 ///
    428 /// \headerfile <x86intrin.h>
    429 ///
    430 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
    431 ///
    432 /// \param __a
    433 ///    A 128-bit vector of [2 x double].
    434 /// \param __b
    435 ///    A 128-bit vector of [2 x double].
    436 /// \returns A 128-bit vector containing the comparison results.
    437 static __inline__ __m128d __DEFAULT_FN_ATTRS
    438 _mm_cmpeq_pd(__m128d __a, __m128d __b)
    439 {
    440   return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
    441 }
    442 
    443 /// \brief Compares each of the corresponding double-precision values of the
    444 ///    128-bit vectors of [2 x double] to determine if the values in the first
    445 ///    operand are less than those in the second operand. Each comparison
    446 ///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
    447 ///
    448 /// \headerfile <x86intrin.h>
    449 ///
    450 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
    451 ///
    452 /// \param __a
    453 ///    A 128-bit vector of [2 x double].
    454 /// \param __b
    455 ///    A 128-bit vector of [2 x double].
    456 /// \returns A 128-bit vector containing the comparison results.
    457 static __inline__ __m128d __DEFAULT_FN_ATTRS
    458 _mm_cmplt_pd(__m128d __a, __m128d __b)
    459 {
    460   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
    461 }
    462 
    463 /// \brief Compares each of the corresponding double-precision values of the
    464 ///    128-bit vectors of [2 x double] to determine if the values in the first
    465 ///    operand are less than or equal to those in the second operand.
    466 ///
    467 ///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
    468 ///
    469 /// \headerfile <x86intrin.h>
    470 ///
    471 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
    472 ///
    473 /// \param __a
    474 ///    A 128-bit vector of [2 x double].
    475 /// \param __b
    476 ///    A 128-bit vector of [2 x double].
    477 /// \returns A 128-bit vector containing the comparison results.
    478 static __inline__ __m128d __DEFAULT_FN_ATTRS
    479 _mm_cmple_pd(__m128d __a, __m128d __b)
    480 {
    481   return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
    482 }
    483 
    484 /// \brief Compares each of the corresponding double-precision values of the
    485 ///    128-bit vectors of [2 x double] to determine if the values in the first
    486 ///    operand are greater than those in the second operand.
    487 ///
    488 ///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
    489 ///
    490 /// \headerfile <x86intrin.h>
    491 ///
    492 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
    493 ///
    494 /// \param __a
    495 ///    A 128-bit vector of [2 x double].
    496 /// \param __b
    497 ///    A 128-bit vector of [2 x double].
    498 /// \returns A 128-bit vector containing the comparison results.
    499 static __inline__ __m128d __DEFAULT_FN_ATTRS
    500 _mm_cmpgt_pd(__m128d __a, __m128d __b)
    501 {
    502   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
    503 }
    504 
    505 /// \brief Compares each of the corresponding double-precision values of the
    506 ///    128-bit vectors of [2 x double] to determine if the values in the first
    507 ///    operand are greater than or equal to those in the second operand.
    508 ///
    509 ///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
    510 ///
    511 /// \headerfile <x86intrin.h>
    512 ///
    513 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
    514 ///
    515 /// \param __a
    516 ///    A 128-bit vector of [2 x double].
    517 /// \param __b
    518 ///    A 128-bit vector of [2 x double].
    519 /// \returns A 128-bit vector containing the comparison results.
    520 static __inline__ __m128d __DEFAULT_FN_ATTRS
    521 _mm_cmpge_pd(__m128d __a, __m128d __b)
    522 {
    523   return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
    524 }
    525 
    526 /// \brief Compares each of the corresponding double-precision values of the
    527 ///    128-bit vectors of [2 x double] to determine if the values in the first
    528 ///    operand are ordered with respect to those in the second operand.
    529 ///
    530 ///    A pair of double-precision values are "ordered" with respect to each
    531 ///    other if neither value is a NaN. Each comparison yields 0h for false,
    532 ///    FFFFFFFFFFFFFFFFh for true.
    533 ///
    534 /// \headerfile <x86intrin.h>
    535 ///
    536 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
    537 ///
    538 /// \param __a
    539 ///    A 128-bit vector of [2 x double].
    540 /// \param __b
    541 ///    A 128-bit vector of [2 x double].
    542 /// \returns A 128-bit vector containing the comparison results.
    543 static __inline__ __m128d __DEFAULT_FN_ATTRS
    544 _mm_cmpord_pd(__m128d __a, __m128d __b)
    545 {
    546   return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
    547 }
    548 
    549 /// \brief Compares each of the corresponding double-precision values of the
    550 ///    128-bit vectors of [2 x double] to determine if the values in the first
    551 ///    operand are unordered with respect to those in the second operand.
    552 ///
    553 ///    A pair of double-precision values are "unordered" with respect to each
    554 ///    other if one or both values are NaN. Each comparison yields 0h for false,
    555 ///    FFFFFFFFFFFFFFFFh for true.
    556 ///
    557 /// \headerfile <x86intrin.h>
    558 ///
    559 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
    560 ///   instruction.
    561 ///
    562 /// \param __a
    563 ///    A 128-bit vector of [2 x double].
    564 /// \param __b
    565 ///    A 128-bit vector of [2 x double].
    566 /// \returns A 128-bit vector containing the comparison results.
    567 static __inline__ __m128d __DEFAULT_FN_ATTRS
    568 _mm_cmpunord_pd(__m128d __a, __m128d __b)
    569 {
    570   return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
    571 }
    572 
    573 /// \brief Compares each of the corresponding double-precision values of the
    574 ///    128-bit vectors of [2 x double] to determine if the values in the first
    575 ///    operand are unequal to those in the second operand.
    576 ///
    577 ///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
    578 ///
    579 /// \headerfile <x86intrin.h>
    580 ///
    581 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
    582 ///
    583 /// \param __a
    584 ///    A 128-bit vector of [2 x double].
    585 /// \param __b
    586 ///    A 128-bit vector of [2 x double].
    587 /// \returns A 128-bit vector containing the comparison results.
    588 static __inline__ __m128d __DEFAULT_FN_ATTRS
    589 _mm_cmpneq_pd(__m128d __a, __m128d __b)
    590 {
    591   return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
    592 }
    593 
    594 /// \brief Compares each of the corresponding double-precision values of the
    595 ///    128-bit vectors of [2 x double] to determine if the values in the first
    596 ///    operand are not less than those in the second operand.
    597 ///
    598 ///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
    599 ///
    600 /// \headerfile <x86intrin.h>
    601 ///
    602 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
    603 ///
    604 /// \param __a
    605 ///    A 128-bit vector of [2 x double].
    606 /// \param __b
    607 ///    A 128-bit vector of [2 x double].
    608 /// \returns A 128-bit vector containing the comparison results.
    609 static __inline__ __m128d __DEFAULT_FN_ATTRS
    610 _mm_cmpnlt_pd(__m128d __a, __m128d __b)
    611 {
    612   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
    613 }
    614 
    615 /// \brief Compares each of the corresponding double-precision values of the
    616 ///    128-bit vectors of [2 x double] to determine if the values in the first
    617 ///    operand are not less than or equal to those in the second operand.
    618 ///
    619 ///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
    620 ///
    621 /// \headerfile <x86intrin.h>
    622 ///
    623 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
    624 ///
    625 /// \param __a
    626 ///    A 128-bit vector of [2 x double].
    627 /// \param __b
    628 ///    A 128-bit vector of [2 x double].
    629 /// \returns A 128-bit vector containing the comparison results.
    630 static __inline__ __m128d __DEFAULT_FN_ATTRS
    631 _mm_cmpnle_pd(__m128d __a, __m128d __b)
    632 {
    633   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
    634 }
    635 
    636 /// \brief Compares each of the corresponding double-precision values of the
    637 ///    128-bit vectors of [2 x double] to determine if the values in the first
    638 ///    operand are not greater than those in the second operand.
    639 ///
    640 ///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
    641 ///
    642 /// \headerfile <x86intrin.h>
    643 ///
    644 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
    645 ///
    646 /// \param __a
    647 ///    A 128-bit vector of [2 x double].
    648 /// \param __b
    649 ///    A 128-bit vector of [2 x double].
    650 /// \returns A 128-bit vector containing the comparison results.
    651 static __inline__ __m128d __DEFAULT_FN_ATTRS
    652 _mm_cmpngt_pd(__m128d __a, __m128d __b)
    653 {
    654   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
    655 }
    656 
    657 /// \brief Compares each of the corresponding double-precision values of the
    658 ///    128-bit vectors of [2 x double] to determine if the values in the first
    659 ///    operand are not greater than or equal to those in the second operand.
    660 ///
    661 ///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
    662 ///
    663 /// \headerfile <x86intrin.h>
    664 ///
    665 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
    666 ///
    667 /// \param __a
    668 ///    A 128-bit vector of [2 x double].
    669 /// \param __b
    670 ///    A 128-bit vector of [2 x double].
    671 /// \returns A 128-bit vector containing the comparison results.
    672 static __inline__ __m128d __DEFAULT_FN_ATTRS
    673 _mm_cmpnge_pd(__m128d __a, __m128d __b)
    674 {
    675   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
    676 }
    677 
    678 /// \brief Compares the lower double-precision floating-point values in each of
    679 ///    the two 128-bit floating-point vectors of [2 x double] for equality.
    680 ///
    681 ///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
    682 ///
    683 /// \headerfile <x86intrin.h>
    684 ///
    685 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
    686 ///
    687 /// \param __a
    688 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    689 ///    compared to the lower double-precision value of \a __b.
    690 /// \param __b
    691 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    692 ///    compared to the lower double-precision value of \a __a.
    693 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
    694 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
    695 static __inline__ __m128d __DEFAULT_FN_ATTRS
    696 _mm_cmpeq_sd(__m128d __a, __m128d __b)
    697 {
    698   return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
    699 }
    700 
    701 /// \brief Compares the lower double-precision floating-point values in each of
    702 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
    703 ///    the value in the first parameter is less than the corresponding value in
    704 ///    the second parameter.
    705 ///
    706 ///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
    707 ///
    708 /// \headerfile <x86intrin.h>
    709 ///
    710 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
    711 ///
    712 /// \param __a
    713 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    714 ///    compared to the lower double-precision value of \a __b.
    715 /// \param __b
    716 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    717 ///    compared to the lower double-precision value of \a __a.
    718 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
    719 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
    720 static __inline__ __m128d __DEFAULT_FN_ATTRS
    721 _mm_cmplt_sd(__m128d __a, __m128d __b)
    722 {
    723   return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
    724 }
    725 
    726 /// \brief Compares the lower double-precision floating-point values in each of
    727 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
    728 ///    the value in the first parameter is less than or equal to the
    729 ///    corresponding value in the second parameter.
    730 ///
    731 ///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
    732 ///
    733 /// \headerfile <x86intrin.h>
    734 ///
    735 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
    736 ///
    737 /// \param __a
    738 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    739 ///    compared to the lower double-precision value of \a __b.
    740 /// \param __b
    741 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    742 ///    compared to the lower double-precision value of \a __a.
    743 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
    744 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
    745 static __inline__ __m128d __DEFAULT_FN_ATTRS
    746 _mm_cmple_sd(__m128d __a, __m128d __b)
    747 {
    748   return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
    749 }
    750 
    751 /// \brief Compares the lower double-precision floating-point values in each of
    752 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
    753 ///    the value in the first parameter is greater than the corresponding value
    754 ///    in the second parameter.
    755 ///
    756 ///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
    757 ///
    758 /// \headerfile <x86intrin.h>
    759 ///
    760 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
    761 ///
    762 /// \param __a
    763 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
    764 ///     compared to the lower double-precision value of \a __b.
    765 /// \param __b
    766 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
    767 ///     compared to the lower double-precision value of \a __a.
    768 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
    769 ///     results. The upper 64 bits are copied from the upper 64 bits of \a __a.
    770 static __inline__ __m128d __DEFAULT_FN_ATTRS
    771 _mm_cmpgt_sd(__m128d __a, __m128d __b)
    772 {
    773   __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
    774   return (__m128d) { __c[0], __a[1] };
    775 }
    776 
    777 /// \brief Compares the lower double-precision floating-point values in each of
    778 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
    779 ///    the value in the first parameter is greater than or equal to the
    780 ///    corresponding value in the second parameter.
    781 ///
    782 ///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
    783 ///
    784 /// \headerfile <x86intrin.h>
    785 ///
    786 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
    787 ///
    788 /// \param __a
    789 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    790 ///    compared to the lower double-precision value of \a __b.
    791 /// \param __b
    792 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    793 ///    compared to the lower double-precision value of \a __a.
    794 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
    795 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
    796 static __inline__ __m128d __DEFAULT_FN_ATTRS
    797 _mm_cmpge_sd(__m128d __a, __m128d __b)
    798 {
    799   __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
    800   return (__m128d) { __c[0], __a[1] };
    801 }
    802 
    803 /// \brief Compares the lower double-precision floating-point values in each of
    804 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
    805 ///    the value in the first parameter is "ordered" with respect to the
    806 ///    corresponding value in the second parameter.
    807 ///
    808 ///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. A pair of
    809 ///    double-precision values are "ordered" with respect to each other if
    810 ///    neither value is a NaN.
    811 ///
    812 /// \headerfile <x86intrin.h>
    813 ///
    814 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
    815 ///
    816 /// \param __a
    817 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    818 ///    compared to the lower double-precision value of \a __b.
    819 /// \param __b
    820 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    821 ///    compared to the lower double-precision value of \a __a.
    822 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
    823 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
    824 static __inline__ __m128d __DEFAULT_FN_ATTRS
    825 _mm_cmpord_sd(__m128d __a, __m128d __b)
    826 {
    827   return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
    828 }
    829 
    830 /// \brief Compares the lower double-precision floating-point values in each of
    831 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
    832 ///    the value in the first parameter is "unordered" with respect to the
    833 ///    corresponding value in the second parameter.
    834 ///
    835 ///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. A pair of
    836 ///    double-precision values are "unordered" with respect to each other if one
    837 ///    or both values are NaN.
    838 ///
    839 /// \headerfile <x86intrin.h>
    840 ///
    841 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
    842 ///   instruction.
    843 ///
    844 /// \param __a
    845 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    846 ///    compared to the lower double-precision value of \a __b.
    847 /// \param __b
    848 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    849 ///    compared to the lower double-precision value of \a __a.
    850 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
    851 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
    852 static __inline__ __m128d __DEFAULT_FN_ATTRS
    853 _mm_cmpunord_sd(__m128d __a, __m128d __b)
    854 {
    855   return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
    856 }
    857 
    858 /// \brief Compares the lower double-precision floating-point values in each of
    859 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
    860 ///    the value in the first parameter is unequal to the corresponding value in
    861 ///    the second parameter.
    862 ///
    863 ///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
    864 ///
    865 /// \headerfile <x86intrin.h>
    866 ///
    867 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
    868 ///
    869 /// \param __a
    870 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    871 ///    compared to the lower double-precision value of \a __b.
    872 /// \param __b
    873 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    874 ///    compared to the lower double-precision value of \a __a.
    875 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
    876 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
    877 static __inline__ __m128d __DEFAULT_FN_ATTRS
    878 _mm_cmpneq_sd(__m128d __a, __m128d __b)
    879 {
    880   return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
    881 }
    882 
    883 /// \brief Compares the lower double-precision floating-point values in each of
    884 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
    885 ///    the value in the first parameter is not less than the corresponding
    886 ///    value in the second parameter.
    887 ///
    888 ///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
    889 ///
    890 /// \headerfile <x86intrin.h>
    891 ///
    892 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
    893 ///
    894 /// \param __a
    895 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    896 ///    compared to the lower double-precision value of \a __b.
    897 /// \param __b
    898 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    899 ///    compared to the lower double-precision value of \a __a.
    900 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
    901 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
    902 static __inline__ __m128d __DEFAULT_FN_ATTRS
    903 _mm_cmpnlt_sd(__m128d __a, __m128d __b)
    904 {
    905   return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
    906 }
    907 
    908 /// \brief Compares the lower double-precision floating-point values in each of
    909 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
    910 ///    the value in the first parameter is not less than or equal to the
    911 ///    corresponding value in the second parameter.
    912 ///
    913 ///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
    914 ///
    915 /// \headerfile <x86intrin.h>
    916 ///
    917 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
    918 ///
    919 /// \param __a
    920 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    921 ///    compared to the lower double-precision value of \a __b.
    922 /// \param __b
    923 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    924 ///    compared to the lower double-precision value of \a __a.
    925 /// \returns  A 128-bit vector. The lower 64 bits contains the comparison
    926 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
    927 static __inline__ __m128d __DEFAULT_FN_ATTRS
    928 _mm_cmpnle_sd(__m128d __a, __m128d __b)
    929 {
    930   return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
    931 }
    932 
    933 /// \brief Compares the lower double-precision floating-point values in each of
    934 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
    935 ///    the value in the first parameter is not greater than the corresponding
    936 ///    value in the second parameter.
    937 ///
    938 ///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
    939 ///
    940 /// \headerfile <x86intrin.h>
    941 ///
    942 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
    943 ///
    944 /// \param __a
    945 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    946 ///    compared to the lower double-precision value of \a __b.
    947 /// \param __b
    948 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    949 ///    compared to the lower double-precision value of \a __a.
    950 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
    951 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
    952 static __inline__ __m128d __DEFAULT_FN_ATTRS
    953 _mm_cmpngt_sd(__m128d __a, __m128d __b)
    954 {
    955   __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
    956   return (__m128d) { __c[0], __a[1] };
    957 }
    958 
    959 /// \brief Compares the lower double-precision floating-point values in each of
    960 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
    961 ///    the value in the first parameter is not greater than or equal to the
    962 ///    corresponding value in the second parameter.
    963 ///
    964 ///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
    965 ///
    966 /// \headerfile <x86intrin.h>
    967 ///
    968 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
    969 ///
    970 /// \param __a
    971 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    972 ///    compared to the lower double-precision value of \a __b.
    973 /// \param __b
    974 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    975 ///    compared to the lower double-precision value of \a __a.
    976 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
    977 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
    978 static __inline__ __m128d __DEFAULT_FN_ATTRS
    979 _mm_cmpnge_sd(__m128d __a, __m128d __b)
    980 {
    981   __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
    982   return (__m128d) { __c[0], __a[1] };
    983 }
    984 
    985 /// \brief Compares the lower double-precision floating-point values in each of
    986 ///    the two 128-bit floating-point vectors of [2 x double] for equality. The
    987 ///    comparison yields 0 for false, 1 for true.
    988 ///
    989 /// \headerfile <x86intrin.h>
    990 ///
    991 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
    992 ///
    993 /// \param __a
    994 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    995 ///    compared to the lower double-precision value of \a __b.
    996 /// \param __b
    997 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
    998 ///    compared to the lower double-precision value of \a __a.
    999 /// \returns An integer containing the comparison results.
   1000 static __inline__ int __DEFAULT_FN_ATTRS
   1001 _mm_comieq_sd(__m128d __a, __m128d __b)
   1002 {
   1003   return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
   1004 }
   1005 
   1006 /// \brief Compares the lower double-precision floating-point values in each of
   1007 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
   1008 ///    the value in the first parameter is less than the corresponding value in
   1009 ///    the second parameter.
   1010 ///
   1011 ///    The comparison yields 0 for false, 1 for true.
   1012 ///
   1013 /// \headerfile <x86intrin.h>
   1014 ///
   1015 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
   1016 ///
   1017 /// \param __a
   1018 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
   1019 ///    compared to the lower double-precision value of \a __b.
   1020 /// \param __b
   1021 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
   1022 ///    compared to the lower double-precision value of \a __a.
   1023 /// \returns An integer containing the comparison results.
   1024 static __inline__ int __DEFAULT_FN_ATTRS
   1025 _mm_comilt_sd(__m128d __a, __m128d __b)
   1026 {
   1027   return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
   1028 }
   1029 
   1030 /// \brief Compares the lower double-precision floating-point values in each of
   1031 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
   1032 ///    the value in the first parameter is less than or equal to the
   1033 ///    corresponding value in the second parameter.
   1034 ///
   1035 ///    The comparison yields 0 for false, 1 for true.
   1036 ///
   1037 /// \headerfile <x86intrin.h>
   1038 ///
   1039 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
   1040 ///
   1041 /// \param __a
   1042 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
   1043 ///    compared to the lower double-precision value of \a __b.
   1044 /// \param __b
   1045 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
   1046 ///     compared to the lower double-precision value of \a __a.
   1047 /// \returns An integer containing the comparison results.
   1048 static __inline__ int __DEFAULT_FN_ATTRS
   1049 _mm_comile_sd(__m128d __a, __m128d __b)
   1050 {
   1051   return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
   1052 }
   1053 
   1054 /// \brief Compares the lower double-precision floating-point values in each of
   1055 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
   1056 ///    the value in the first parameter is greater than the corresponding value
   1057 ///    in the second parameter.
   1058 ///
   1059 ///    The comparison yields 0 for false, 1 for true.
   1060 ///
   1061 /// \headerfile <x86intrin.h>
   1062 ///
   1063 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
   1064 ///
   1065 /// \param __a
   1066 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
   1067 ///    compared to the lower double-precision value of \a __b.
   1068 /// \param __b
   1069 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
   1070 ///    compared to the lower double-precision value of \a __a.
   1071 /// \returns An integer containing the comparison results.
   1072 static __inline__ int __DEFAULT_FN_ATTRS
   1073 _mm_comigt_sd(__m128d __a, __m128d __b)
   1074 {
   1075   return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
   1076 }
   1077 
   1078 /// \brief Compares the lower double-precision floating-point values in each of
   1079 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
   1080 ///    the value in the first parameter is greater than or equal to the
   1081 ///    corresponding value in the second parameter.
   1082 ///
   1083 ///    The comparison yields 0 for false, 1 for true.
   1084 ///
   1085 /// \headerfile <x86intrin.h>
   1086 ///
   1087 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
   1088 ///
   1089 /// \param __a
   1090 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
   1091 ///    compared to the lower double-precision value of \a __b.
   1092 /// \param __b
   1093 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
   1094 ///    compared to the lower double-precision value of \a __a.
   1095 /// \returns An integer containing the comparison results.
   1096 static __inline__ int __DEFAULT_FN_ATTRS
   1097 _mm_comige_sd(__m128d __a, __m128d __b)
   1098 {
   1099   return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
   1100 }
   1101 
   1102 /// \brief Compares the lower double-precision floating-point values in each of
   1103 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
   1104 ///    the value in the first parameter is unequal to the corresponding value in
   1105 ///    the second parameter.
   1106 ///
   1107 ///    The comparison yields 0 for false, 1 for true.
   1108 ///
   1109 /// \headerfile <x86intrin.h>
   1110 ///
   1111 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
   1112 ///
   1113 /// \param __a
   1114 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
   1115 ///    compared to the lower double-precision value of \a __b.
   1116 /// \param __b
   1117 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
   1118 ///    compared to the lower double-precision value of \a __a.
   1119 /// \returns An integer containing the comparison results.
   1120 static __inline__ int __DEFAULT_FN_ATTRS
   1121 _mm_comineq_sd(__m128d __a, __m128d __b)
   1122 {
   1123   return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
   1124 }
   1125 
   1126 /// \brief Compares the lower double-precision floating-point values in each of
   1127 ///    the two 128-bit floating-point vectors of [2 x double] for equality. The
   1128 ///    comparison yields 0 for false, 1 for true.
   1129 ///
   1130 ///    If either of the two lower double-precision values is NaN, 1 is returned.
   1131 ///
   1132 /// \headerfile <x86intrin.h>
   1133 ///
   1134 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
   1135 ///
   1136 /// \param __a
   1137 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
   1138 ///    compared to the lower double-precision value of \a __b.
   1139 /// \param __b
   1140 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
   1141 ///    compared to the lower double-precision value of \a __a.
   1142 /// \returns An integer containing the comparison results. If either of the two
   1143 ///    lower double-precision values is NaN, 1 is returned.
   1144 static __inline__ int __DEFAULT_FN_ATTRS
   1145 _mm_ucomieq_sd(__m128d __a, __m128d __b)
   1146 {
   1147   return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
   1148 }
   1149 
   1150 /// \brief Compares the lower double-precision floating-point values in each of
   1151 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
   1152 ///    the value in the first parameter is less than the corresponding value in
   1153 ///    the second parameter.
   1154 ///
   1155 ///    The comparison yields 0 for false, 1 for true. If either of the two lower
   1156 ///    double-precision values is NaN, 1 is returned.
   1157 ///
   1158 /// \headerfile <x86intrin.h>
   1159 ///
   1160 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
   1161 ///
   1162 /// \param __a
   1163 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
   1164 ///    compared to the lower double-precision value of \a __b.
   1165 /// \param __b
   1166 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
   1167 ///    compared to the lower double-precision value of \a __a.
   1168 /// \returns An integer containing the comparison results. If either of the two
   1169 ///    lower double-precision values is NaN, 1 is returned.
   1170 static __inline__ int __DEFAULT_FN_ATTRS
   1171 _mm_ucomilt_sd(__m128d __a, __m128d __b)
   1172 {
   1173   return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
   1174 }
   1175 
   1176 /// \brief Compares the lower double-precision floating-point values in each of
   1177 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
   1178 ///    the value in the first parameter is less than or equal to the
   1179 ///    corresponding value in the second parameter.
   1180 ///
   1181 ///    The comparison yields 0 for false, 1 for true. If either of the two lower
   1182 ///    double-precision values is NaN, 1 is returned.
   1183 ///
   1184 /// \headerfile <x86intrin.h>
   1185 ///
   1186 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
   1187 ///
   1188 /// \param __a
   1189 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
   1190 ///    compared to the lower double-precision value of \a __b.
   1191 /// \param __b
   1192 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
   1193 ///     compared to the lower double-precision value of \a __a.
   1194 /// \returns An integer containing the comparison results. If either of the two
   1195 ///     lower double-precision values is NaN, 1 is returned.
   1196 static __inline__ int __DEFAULT_FN_ATTRS
   1197 _mm_ucomile_sd(__m128d __a, __m128d __b)
   1198 {
   1199   return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
   1200 }
   1201 
   1202 /// \brief Compares the lower double-precision floating-point values in each of
   1203 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
   1204 ///    the value in the first parameter is greater than the corresponding value
   1205 ///    in the second parameter.
   1206 ///
   1207 ///    The comparison yields 0 for false, 1 for true. If either of the two lower
   1208 ///    double-precision values is NaN, 0 is returned.
   1209 ///
   1210 /// \headerfile <x86intrin.h>
   1211 ///
   1212 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
   1213 ///
   1214 /// \param __a
   1215 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
   1216 ///    compared to the lower double-precision value of \a __b.
   1217 /// \param __b
   1218 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
   1219 ///     compared to the lower double-precision value of \a __a.
   1220 /// \returns An integer containing the comparison results. If either of the two
   1221 ///     lower double-precision values is NaN, 0 is returned.
   1222 static __inline__ int __DEFAULT_FN_ATTRS
   1223 _mm_ucomigt_sd(__m128d __a, __m128d __b)
   1224 {
   1225   return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
   1226 }
   1227 
   1228 /// \brief Compares the lower double-precision floating-point values in each of
   1229 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
   1230 ///    the value in the first parameter is greater than or equal to the
   1231 ///    corresponding value in the second parameter.
   1232 ///
   1233 ///    The comparison yields 0 for false, 1 for true.  If either of the two
   1234 ///    lower double-precision values is NaN, 0 is returned.
   1235 ///
   1236 /// \headerfile <x86intrin.h>
   1237 ///
   1238 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
   1239 ///
   1240 /// \param __a
   1241 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
   1242 ///    compared to the lower double-precision value of \a __b.
   1243 /// \param __b
   1244 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
   1245 ///    compared to the lower double-precision value of \a __a.
   1246 /// \returns An integer containing the comparison results. If either of the two
   1247 ///    lower double-precision values is NaN, 0 is returned.
   1248 static __inline__ int __DEFAULT_FN_ATTRS
   1249 _mm_ucomige_sd(__m128d __a, __m128d __b)
   1250 {
   1251   return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
   1252 }
   1253 
   1254 /// \brief Compares the lower double-precision floating-point values in each of
   1255 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
   1256 ///    the value in the first parameter is unequal to the corresponding value in
   1257 ///    the second parameter.
   1258 ///
   1259 ///    The comparison yields 0 for false, 1 for true. If either of the two lower
   1260 ///    double-precision values is NaN, 0 is returned.
   1261 ///
   1262 /// \headerfile <x86intrin.h>
   1263 ///
   1264 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
   1265 ///
   1266 /// \param __a
   1267 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
   1268 ///    compared to the lower double-precision value of \a __b.
   1269 /// \param __b
   1270 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
   1271 ///    compared to the lower double-precision value of \a __a.
   1272 /// \returns An integer containing the comparison result. If either of the two
   1273 ///    lower double-precision values is NaN, 0 is returned.
   1274 static __inline__ int __DEFAULT_FN_ATTRS
   1275 _mm_ucomineq_sd(__m128d __a, __m128d __b)
   1276 {
   1277   return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
   1278 }
   1279 
   1280 /// \brief Converts the two double-precision floating-point elements of a
   1281 ///    128-bit vector of [2 x double] into two single-precision floating-point
   1282 ///    values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
   1283 ///    The upper 64 bits of the result vector are set to zero.
   1284 ///
   1285 /// \headerfile <x86intrin.h>
   1286 ///
   1287 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
   1288 ///
   1289 /// \param __a
   1290 ///    A 128-bit vector of [2 x double].
   1291 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
   1292 ///    converted values. The upper 64 bits are set to zero.
   1293 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1294 _mm_cvtpd_ps(__m128d __a)
   1295 {
   1296   return __builtin_ia32_cvtpd2ps((__v2df)__a);
   1297 }
   1298 
   1299 /// \brief Converts the lower two single-precision floating-point elements of a
   1300 ///    128-bit vector of [4 x float] into two double-precision floating-point
   1301 ///    values, returned in a 128-bit vector of [2 x double]. The upper two
   1302 ///    elements of the input vector are unused.
   1303 ///
   1304 /// \headerfile <x86intrin.h>
   1305 ///
   1306 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
   1307 ///
   1308 /// \param __a
   1309 ///    A 128-bit vector of [4 x float]. The lower two single-precision
   1310 ///    floating-point elements are converted to double-precision values. The
   1311 ///    upper two elements are unused.
   1312 /// \returns A 128-bit vector of [2 x double] containing the converted values.
   1313 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1314 _mm_cvtps_pd(__m128 __a)
   1315 {
   1316   return (__m128d) __builtin_convertvector(
   1317       __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
   1318 }
   1319 
   1320 /// \brief Converts the lower two integer elements of a 128-bit vector of
   1321 ///    [4 x i32] into two double-precision floating-point values, returned in a
   1322 ///    128-bit vector of [2 x double].
   1323 ///
   1324 ///    The upper two elements of the input vector are unused.
   1325 ///
   1326 /// \headerfile <x86intrin.h>
   1327 ///
   1328 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
   1329 ///
   1330 /// \param __a
   1331 ///    A 128-bit integer vector of [4 x i32]. The lower two integer elements are
   1332 ///    converted to double-precision values.
   1333 ///
   1334 ///    The upper two elements are unused.
   1335 /// \returns A 128-bit vector of [2 x double] containing the converted values.
   1336 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1337 _mm_cvtepi32_pd(__m128i __a)
   1338 {
   1339   return (__m128d) __builtin_convertvector(
   1340       __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
   1341 }
   1342 
   1343 /// \brief Converts the two double-precision floating-point elements of a
   1344 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
   1345 ///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
   1346 ///    64 bits of the result vector are set to zero.
   1347 ///
   1348 /// \headerfile <x86intrin.h>
   1349 ///
   1350 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
   1351 ///
   1352 /// \param __a
   1353 ///    A 128-bit vector of [2 x double].
   1354 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
   1355 ///    converted values. The upper 64 bits are set to zero.
   1356 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1357 _mm_cvtpd_epi32(__m128d __a)
   1358 {
   1359   return __builtin_ia32_cvtpd2dq((__v2df)__a);
   1360 }
   1361 
   1362 /// \brief Converts the low-order element of a 128-bit vector of [2 x double]
   1363 ///    into a 32-bit signed integer value.
   1364 ///
   1365 /// \headerfile <x86intrin.h>
   1366 ///
   1367 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
   1368 ///
   1369 /// \param __a
   1370 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
   1371 ///    conversion.
   1372 /// \returns A 32-bit signed integer containing the converted value.
   1373 static __inline__ int __DEFAULT_FN_ATTRS
   1374 _mm_cvtsd_si32(__m128d __a)
   1375 {
   1376   return __builtin_ia32_cvtsd2si((__v2df)__a);
   1377 }
   1378 
   1379 /// \brief Converts the lower double-precision floating-point element of a
   1380 ///    128-bit vector of [2 x double], in the second parameter, into a
   1381 ///    single-precision floating-point value, returned in the lower 32 bits of a
   1382 ///    128-bit vector of [4 x float]. The upper 96 bits of the result vector are
   1383 ///    copied from the upper 96 bits of the first parameter.
   1384 ///
   1385 /// \headerfile <x86intrin.h>
   1386 ///
   1387 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
   1388 ///
   1389 /// \param __a
   1390 ///    A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
   1391 ///    copied to the upper 96 bits of the result.
   1392 /// \param __b
   1393 ///    A 128-bit vector of [2 x double]. The lower double-precision
   1394 ///    floating-point element is used in the conversion.
   1395 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
   1396 ///    converted value from the second parameter. The upper 96 bits are copied
   1397 ///    from the upper 96 bits of the first parameter.
   1398 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1399 _mm_cvtsd_ss(__m128 __a, __m128d __b)
   1400 {
   1401   return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
   1402 }
   1403 
   1404 /// \brief Converts a 32-bit signed integer value, in the second parameter, into
   1405 ///    a double-precision floating-point value, returned in the lower 64 bits of
   1406 ///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
   1407 ///    are copied from the upper 64 bits of the first parameter.
   1408 ///
   1409 /// \headerfile <x86intrin.h>
   1410 ///
   1411 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
   1412 ///
   1413 /// \param __a
   1414 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
   1415 ///    copied to the upper 64 bits of the result.
   1416 /// \param __b
   1417 ///    A 32-bit signed integer containing the value to be converted.
   1418 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
   1419 ///    converted value from the second parameter. The upper 64 bits are copied
   1420 ///    from the upper 64 bits of the first parameter.
   1421 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1422 _mm_cvtsi32_sd(__m128d __a, int __b)
   1423 {
   1424   __a[0] = __b;
   1425   return __a;
   1426 }
   1427 
   1428 /// \brief Converts the lower single-precision floating-point element of a
   1429 ///    128-bit vector of [4 x float], in the second parameter, into a
   1430 ///    double-precision floating-point value, returned in the lower 64 bits of
   1431 ///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
   1432 ///    are copied from the upper 64 bits of the first parameter.
   1433 ///
   1434 /// \headerfile <x86intrin.h>
   1435 ///
   1436 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
   1437 ///
   1438 /// \param __a
   1439 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
   1440 ///    copied to the upper 64 bits of the result.
   1441 /// \param __b
   1442 ///    A 128-bit vector of [4 x float]. The lower single-precision
   1443 ///    floating-point element is used in the conversion.
   1444 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
   1445 ///    converted value from the second parameter. The upper 64 bits are copied
   1446 ///    from the upper 64 bits of the first parameter.
   1447 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1448 _mm_cvtss_sd(__m128d __a, __m128 __b)
   1449 {
   1450   __a[0] = __b[0];
   1451   return __a;
   1452 }
   1453 
   1454 /// \brief Converts the two double-precision floating-point elements of a
   1455 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
   1456 ///    returned in the lower 64 bits of a 128-bit vector of [4 x i32].
   1457 ///
   1458 ///    If the result of either conversion is inexact, the result is truncated
   1459 ///    (rounded towards zero) regardless of the current MXCSR setting. The upper
   1460 ///    64 bits of the result vector are set to zero.
   1461 ///
   1462 /// \headerfile <x86intrin.h>
   1463 ///
   1464 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
   1465 ///   instruction.
   1466 ///
   1467 /// \param __a
   1468 ///    A 128-bit vector of [2 x double].
   1469 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
   1470 ///    converted values. The upper 64 bits are set to zero.
   1471 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1472 _mm_cvttpd_epi32(__m128d __a)
   1473 {
   1474   return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
   1475 }
   1476 
   1477 /// \brief Converts the low-order element of a [2 x double] vector into a 32-bit
   1478 ///    signed integer value, truncating the result when it is inexact.
   1479 ///
   1480 /// \headerfile <x86intrin.h>
   1481 ///
   1482 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
   1483 ///   instruction.
   1484 ///
   1485 /// \param __a
   1486 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
   1487 ///    conversion.
   1488 /// \returns A 32-bit signed integer containing the converted value.
   1489 static __inline__ int __DEFAULT_FN_ATTRS
   1490 _mm_cvttsd_si32(__m128d __a)
   1491 {
   1492   return __builtin_ia32_cvttsd2si((__v2df)__a);
   1493 }
   1494 
   1495 /// \brief Converts the two double-precision floating-point elements of a
   1496 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
   1497 ///    returned in a 64-bit vector of [2 x i32].
   1498 ///
   1499 /// \headerfile <x86intrin.h>
   1500 ///
   1501 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
   1502 ///
   1503 /// \param __a
   1504 ///    A 128-bit vector of [2 x double].
   1505 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
   1506 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1507 _mm_cvtpd_pi32(__m128d __a)
   1508 {
   1509   return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
   1510 }
   1511 
   1512 /// \brief Converts the two double-precision floating-point elements of a
   1513 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
   1514 ///    returned in a 64-bit vector of [2 x i32].
   1515 ///
   1516 ///    If the result of either conversion is inexact, the result is truncated
   1517 ///    (rounded towards zero) regardless of the current MXCSR setting.
   1518 ///
   1519 /// \headerfile <x86intrin.h>
   1520 ///
   1521 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
   1522 ///
   1523 /// \param __a
   1524 ///    A 128-bit vector of [2 x double].
   1525 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
   1526 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1527 _mm_cvttpd_pi32(__m128d __a)
   1528 {
   1529   return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
   1530 }
   1531 
   1532 /// \brief Converts the two signed 32-bit integer elements of a 64-bit vector of
   1533 ///    [2 x i32] into two double-precision floating-point values, returned in a
   1534 ///    128-bit vector of [2 x double].
   1535 ///
   1536 /// \headerfile <x86intrin.h>
   1537 ///
   1538 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
   1539 ///
   1540 /// \param __a
   1541 ///    A 64-bit vector of [2 x i32].
   1542 /// \returns A 128-bit vector of [2 x double] containing the converted values.
   1543 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1544 _mm_cvtpi32_pd(__m64 __a)
   1545 {
   1546   return __builtin_ia32_cvtpi2pd((__v2si)__a);
   1547 }
   1548 
   1549 /// \brief Returns the low-order element of a 128-bit vector of [2 x double] as
   1550 ///    a double-precision floating-point value.
   1551 ///
   1552 /// \headerfile <x86intrin.h>
   1553 ///
   1554 /// This intrinsic has no corresponding instruction.
   1555 ///
   1556 /// \param __a
   1557 ///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
   1558 /// \returns A double-precision floating-point value copied from the lower 64
   1559 ///    bits of \a __a.
   1560 static __inline__ double __DEFAULT_FN_ATTRS
   1561 _mm_cvtsd_f64(__m128d __a)
   1562 {
   1563   return __a[0];
   1564 }
   1565 
   1566 /// \brief Loads a 128-bit floating-point vector of [2 x double] from an aligned
   1567 ///    memory location.
   1568 ///
   1569 /// \headerfile <x86intrin.h>
   1570 ///
   1571 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
   1572 ///
   1573 /// \param __dp
   1574 ///    A pointer to a 128-bit memory location. The address of the memory
   1575 ///    location has to be 16-byte aligned.
   1576 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
   1577 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1578 _mm_load_pd(double const *__dp)
   1579 {
   1580   return *(__m128d*)__dp;
   1581 }
   1582 
   1583 /// \brief Loads a double-precision floating-point value from a specified memory
   1584 ///    location and duplicates it to both vector elements of a 128-bit vector of
   1585 ///    [2 x double].
   1586 ///
   1587 /// \headerfile <x86intrin.h>
   1588 ///
   1589 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
   1590 ///
   1591 /// \param __dp
   1592 ///    A pointer to a memory location containing a double-precision value.
   1593 /// \returns A 128-bit vector of [2 x double] containing the loaded and
   1594 ///    duplicated values.
   1595 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1596 _mm_load1_pd(double const *__dp)
   1597 {
   1598   struct __mm_load1_pd_struct {
   1599     double __u;
   1600   } __attribute__((__packed__, __may_alias__));
   1601   double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
   1602   return (__m128d){ __u, __u };
   1603 }
   1604 
   /* Alternate spelling of _mm_load1_pd; expands to a direct call. */
   #define _mm_load_pd1(dp) _mm_load1_pd(dp)
   1606 
   1607 /// \brief Loads two double-precision values, in reverse order, from an aligned
   1608 ///    memory location into a 128-bit vector of [2 x double].
   1609 ///
   1610 /// \headerfile <x86intrin.h>
   1611 ///
   1612 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
   1613 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
   1614 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
   1615 ///
   1616 /// \param __dp
   1617 ///    A 16-byte aligned pointer to an array of double-precision values to be
   1618 ///    loaded in reverse order.
   1619 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
   1620 ///    values.
   1621 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1622 _mm_loadr_pd(double const *__dp)
   1623 {
   1624   __m128d __u = *(__m128d*)__dp;
   1625   return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
   1626 }
   1627 
   1628 /// \brief Loads a 128-bit floating-point vector of [2 x double] from an
   1629 ///    unaligned memory location.
   1630 ///
   1631 /// \headerfile <x86intrin.h>
   1632 ///
   1633 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
   1634 ///
   1635 /// \param __dp
   1636 ///    A pointer to a 128-bit memory location. The address of the memory
   1637 ///    location does not have to be aligned.
   1638 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
   1639 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1640 _mm_loadu_pd(double const *__dp)
   1641 {
   1642   struct __loadu_pd {
   1643     __m128d __v;
   1644   } __attribute__((__packed__, __may_alias__));
   1645   return ((struct __loadu_pd*)__dp)->__v;
   1646 }
   1647 
   1648 /// \brief Loads a 64-bit integer value to the low element of a 128-bit integer
   1649 ///    vector and clears the upper element.
   1650 ///
   1651 /// \headerfile <x86intrin.h>
   1652 ///
   1653 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
   1654 ///
   1655 /// \param __a
   1656 ///    A pointer to a 64-bit memory location. The address of the memory
   1657 ///    location does not have to be aligned.
   1658 /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
   1659 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1660 _mm_loadu_si64(void const *__a)
   1661 {
   1662   struct __loadu_si64 {
   1663     long long __v;
   1664   } __attribute__((__packed__, __may_alias__));
   1665   long long __u = ((struct __loadu_si64*)__a)->__v;
   1666   return (__m128i){__u, 0L};
   1667 }
   1668 
   1669 /// \brief Loads a 64-bit double-precision value to the low element of a
   1670 ///    128-bit integer vector and clears the upper element.
   1671 ///
   1672 /// \headerfile <x86intrin.h>
   1673 ///
   1674 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
   1675 ///
   1676 /// \param __dp
   1677 ///    A pointer to a memory location containing a double-precision value.
   1678 ///    The address of the memory location does not have to be aligned.
   1679 /// \returns A 128-bit vector of [2 x double] containing the loaded value.
   1680 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1681 _mm_load_sd(double const *__dp)
   1682 {
   1683   struct __mm_load_sd_struct {
   1684     double __u;
   1685   } __attribute__((__packed__, __may_alias__));
   1686   double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
   1687   return (__m128d){ __u, 0 };
   1688 }
   1689 
   1690 /// \brief Loads a double-precision value into the high-order bits of a 128-bit
   1691 ///    vector of [2 x double]. The low-order bits are copied from the low-order
   1692 ///    bits of the first operand.
   1693 ///
   1694 /// \headerfile <x86intrin.h>
   1695 ///
   1696 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
   1697 ///
   1698 /// \param __a
   1699 ///    A 128-bit vector of [2 x double]. \n
   1700 ///    Bits [63:0] are written to bits [63:0] of the result.
   1701 /// \param __dp
   1702 ///    A pointer to a 64-bit memory location containing a double-precision
   1703 ///    floating-point value that is loaded. The loaded value is written to bits
   1704 ///    [127:64] of the result. The address of the memory location does not have
   1705 ///    to be aligned.
   1706 /// \returns A 128-bit vector of [2 x double] containing the moved values.
   1707 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1708 _mm_loadh_pd(__m128d __a, double const *__dp)
   1709 {
   1710   struct __mm_loadh_pd_struct {
   1711     double __u;
   1712   } __attribute__((__packed__, __may_alias__));
   1713   double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
   1714   return (__m128d){ __a[0], __u };
   1715 }
   1716 
   1717 /// \brief Loads a double-precision value into the low-order bits of a 128-bit
   1718 ///    vector of [2 x double]. The high-order bits are copied from the
   1719 ///    high-order bits of the first operand.
   1720 ///
   1721 /// \headerfile <x86intrin.h>
   1722 ///
   1723 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
   1724 ///
   1725 /// \param __a
   1726 ///    A 128-bit vector of [2 x double]. \n
   1727 ///    Bits [127:64] are written to bits [127:64] of the result.
   1728 /// \param __dp
   1729 ///    A pointer to a 64-bit memory location containing a double-precision
   1730 ///    floating-point value that is loaded. The loaded value is written to bits
   1731 ///    [63:0] of the result. The address of the memory location does not have to
   1732 ///    be aligned.
   1733 /// \returns A 128-bit vector of [2 x double] containing the moved values.
   1734 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1735 _mm_loadl_pd(__m128d __a, double const *__dp)
   1736 {
   1737   struct __mm_loadl_pd_struct {
   1738     double __u;
   1739   } __attribute__((__packed__, __may_alias__));
   1740   double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
   1741   return (__m128d){ __u, __a[1] };
   1742 }
   1743 
   1744 /// \brief Constructs a 128-bit floating-point vector of [2 x double] with
   1745 ///    unspecified content. This could be used as an argument to another
   1746 ///    intrinsic function where the argument is required but the value is not
   1747 ///    actually used.
   1748 ///
   1749 /// \headerfile <x86intrin.h>
   1750 ///
   1751 /// This intrinsic has no corresponding instruction.
   1752 ///
   1753 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
   1754 ///    content.
   1755 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1756 _mm_undefined_pd(void)
   1757 {
   1758   return (__m128d)__builtin_ia32_undef128();
   1759 }
   1760 
   1761 /// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
   1762 ///    64 bits of the vector are initialized with the specified double-precision
   1763 ///    floating-point value. The upper 64 bits are set to zero.
   1764 ///
   1765 /// \headerfile <x86intrin.h>
   1766 ///
   1767 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
   1768 ///
   1769 /// \param __w
   1770 ///    A double-precision floating-point value used to initialize the lower 64
   1771 ///    bits of the result.
   1772 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
   1773 ///    lower 64 bits contain the value of the parameter. The upper 64 bits are
   1774 ///    set to zero.
   1775 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1776 _mm_set_sd(double __w)
   1777 {
   1778   return (__m128d){ __w, 0 };
   1779 }
   1780 
   1781 /// \brief Constructs a 128-bit floating-point vector of [2 x double], with each
   1782 ///    of the two double-precision floating-point vector elements set to the
   1783 ///    specified double-precision floating-point value.
   1784 ///
   1785 /// \headerfile <x86intrin.h>
   1786 ///
   1787 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
   1788 ///
   1789 /// \param __w
   1790 ///    A double-precision floating-point value used to initialize each vector
   1791 ///    element of the result.
   1792 /// \returns An initialized 128-bit floating-point vector of [2 x double].
   1793 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1794 _mm_set1_pd(double __w)
   1795 {
   1796   return (__m128d){ __w, __w };
   1797 }
   1798 
   1799 /// \brief Constructs a 128-bit floating-point vector of [2 x double], with each
   1800 ///    of the two double-precision floating-point vector elements set to the
   1801 ///    specified double-precision floating-point value.
   1802 ///
   1803 /// \headerfile <x86intrin.h>
   1804 ///
   1805 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
   1806 ///
   1807 /// \param __w
   1808 ///    A double-precision floating-point value used to initialize each vector
   1809 ///    element of the result.
   1810 /// \returns An initialized 128-bit floating-point vector of [2 x double].
   1811 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1812 _mm_set_pd1(double __w)
   1813 {
   1814   return _mm_set1_pd(__w);
   1815 }
   1816 
   1817 /// \brief Constructs a 128-bit floating-point vector of [2 x double]
   1818 ///    initialized with the specified double-precision floating-point values.
   1819 ///
   1820 /// \headerfile <x86intrin.h>
   1821 ///
   1822 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
   1823 ///
   1824 /// \param __w
   1825 ///    A double-precision floating-point value used to initialize the upper 64
   1826 ///    bits of the result.
   1827 /// \param __x
   1828 ///    A double-precision floating-point value used to initialize the lower 64
   1829 ///    bits of the result.
   1830 /// \returns An initialized 128-bit floating-point vector of [2 x double].
   1831 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1832 _mm_set_pd(double __w, double __x)
   1833 {
   1834   return (__m128d){ __x, __w };
   1835 }
   1836 
   1837 /// \brief Constructs a 128-bit floating-point vector of [2 x double],
   1838 ///    initialized in reverse order with the specified double-precision
   1839 ///    floating-point values.
   1840 ///
   1841 /// \headerfile <x86intrin.h>
   1842 ///
   1843 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
   1844 ///
   1845 /// \param __w
   1846 ///    A double-precision floating-point value used to initialize the lower 64
   1847 ///    bits of the result.
   1848 /// \param __x
   1849 ///    A double-precision floating-point value used to initialize the upper 64
   1850 ///    bits of the result.
   1851 /// \returns An initialized 128-bit floating-point vector of [2 x double].
   1852 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1853 _mm_setr_pd(double __w, double __x)
   1854 {
   1855   return (__m128d){ __w, __x };
   1856 }
   1857 
   1858 /// \brief Constructs a 128-bit floating-point vector of [2 x double]
   1859 ///    initialized to zero.
   1860 ///
   1861 /// \headerfile <x86intrin.h>
   1862 ///
   1863 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
   1864 ///
   1865 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
   1866 ///    all elements set to zero.
   1867 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1868 _mm_setzero_pd(void)
   1869 {
   1870   return (__m128d){ 0, 0 };
   1871 }
   1872 
   1873 /// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
   1874 ///    64 bits are set to the lower 64 bits of the second parameter. The upper
   1875 ///    64 bits are set to the upper 64 bits of the first parameter.
   1876 ///
   1877 /// \headerfile <x86intrin.h>
   1878 ///
   1879 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
   1880 ///
   1881 /// \param __a
   1882 ///    A 128-bit vector of [2 x double]. The upper 64 bits are written to the
   1883 ///    upper 64 bits of the result.
   1884 /// \param __b
   1885 ///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
   1886 ///    lower 64 bits of the result.
   1887 /// \returns A 128-bit vector of [2 x double] containing the moved values.
   1888 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1889 _mm_move_sd(__m128d __a, __m128d __b)
   1890 {
   1891   return (__m128d){ __b[0], __a[1] };
   1892 }
   1893 
   1894 /// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
   1895 ///    memory location.
   1896 ///
   1897 /// \headerfile <x86intrin.h>
   1898 ///
   1899 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
   1900 ///
   1901 /// \param __dp
   1902 ///    A pointer to a 64-bit memory location.
   1903 /// \param __a
   1904 ///    A 128-bit vector of [2 x double] containing the value to be stored.
   1905 static __inline__ void __DEFAULT_FN_ATTRS
   1906 _mm_store_sd(double *__dp, __m128d __a)
   1907 {
   1908   struct __mm_store_sd_struct {
   1909     double __u;
   1910   } __attribute__((__packed__, __may_alias__));
   1911   ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
   1912 }
   1913 
   1914 /// \brief Moves packed double-precision values from a 128-bit vector of
   1915 ///    [2 x double] to a memory location.
   1916 ///
   1917 /// \headerfile <x86intrin.h>
   1918 ///
   1919 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
   1920 ///
   1921 /// \param __dp
   1922 ///    A pointer to an aligned memory location that can store two
   1923 ///    double-precision values.
   1924 /// \param __a
   1925 ///    A packed 128-bit vector of [2 x double] containing the values to be
   1926 ///    moved.
   1927 static __inline__ void __DEFAULT_FN_ATTRS
   1928 _mm_store_pd(double *__dp, __m128d __a)
   1929 {
   1930   *(__m128d*)__dp = __a;
   1931 }
   1932 
   1933 /// \brief Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
   1934 ///    the upper and lower 64 bits of a memory location.
   1935 ///
   1936 /// \headerfile <x86intrin.h>
   1937 ///
   1938 /// This intrinsic corresponds to the <c>VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
   1939 ///
   1940 /// \param __dp
   1941 ///    A pointer to a memory location that can store two double-precision
   1942 ///    values.
   1943 /// \param __a
   1944 ///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
   1945 ///    of the values in \a dp.
   1946 static __inline__ void __DEFAULT_FN_ATTRS
   1947 _mm_store1_pd(double *__dp, __m128d __a)
   1948 {
   1949   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
   1950   _mm_store_pd(__dp, __a);
   1951 }
   1952 
   1953 /// \brief Stores a 128-bit vector of [2 x double] into an aligned memory
   1954 ///    location.
   1955 ///
   1956 /// \headerfile <x86intrin.h>
   1957 ///
   1958 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
   1959 ///
   1960 /// \param __dp
   1961 ///    A pointer to a 128-bit memory location. The address of the memory
   1962 ///    location has to be 16-byte aligned.
   1963 /// \param __a
   1964 ///    A 128-bit vector of [2 x double] containing the values to be stored.
   1965 static __inline__ void __DEFAULT_FN_ATTRS
   1966 _mm_store_pd1(double *__dp, __m128d __a)
   1967 {
   1968   return _mm_store1_pd(__dp, __a);
   1969 }
   1970 
   1971 /// \brief Stores a 128-bit vector of [2 x double] into an unaligned memory
   1972 ///    location.
   1973 ///
   1974 /// \headerfile <x86intrin.h>
   1975 ///
   1976 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
   1977 ///
   1978 /// \param __dp
   1979 ///    A pointer to a 128-bit memory location. The address of the memory
   1980 ///    location does not have to be aligned.
   1981 /// \param __a
   1982 ///    A 128-bit vector of [2 x double] containing the values to be stored.
   1983 static __inline__ void __DEFAULT_FN_ATTRS
   1984 _mm_storeu_pd(double *__dp, __m128d __a)
   1985 {
   1986   struct __storeu_pd {
   1987     __m128d __v;
   1988   } __attribute__((__packed__, __may_alias__));
   1989   ((struct __storeu_pd*)__dp)->__v = __a;
   1990 }
   1991 
   1992 /// \brief Stores two double-precision values, in reverse order, from a 128-bit
   1993 ///    vector of [2 x double] to a 16-byte aligned memory location.
   1994 ///
   1995 /// \headerfile <x86intrin.h>
   1996 ///
   1997 /// This intrinsic corresponds to a shuffling instruction followed by a
   1998 /// <c> VMOVAPD / MOVAPD </c> instruction.
   1999 ///
   2000 /// \param __dp
   2001 ///    A pointer to a 16-byte aligned memory location that can store two
   2002 ///    double-precision values.
   2003 /// \param __a
   2004 ///    A 128-bit vector of [2 x double] containing the values to be reversed and
   2005 ///    stored.
   2006 static __inline__ void __DEFAULT_FN_ATTRS
   2007 _mm_storer_pd(double *__dp, __m128d __a)
   2008 {
   2009   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
   2010   *(__m128d *)__dp = __a;
   2011 }
   2012 
   2013 /// \brief Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
   2014 ///    memory location.
   2015 ///
   2016 /// \headerfile <x86intrin.h>
   2017 ///
   2018 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
   2019 ///
   2020 /// \param __dp
   2021 ///    A pointer to a 64-bit memory location.
   2022 /// \param __a
   2023 ///    A 128-bit vector of [2 x double] containing the value to be stored.
   2024 static __inline__ void __DEFAULT_FN_ATTRS
   2025 _mm_storeh_pd(double *__dp, __m128d __a)
   2026 {
   2027   struct __mm_storeh_pd_struct {
   2028     double __u;
   2029   } __attribute__((__packed__, __may_alias__));
   2030   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
   2031 }
   2032 
   2033 /// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
   2034 ///    memory location.
   2035 ///
   2036 /// \headerfile <x86intrin.h>
   2037 ///
   2038 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
   2039 ///
   2040 /// \param __dp
   2041 ///    A pointer to a 64-bit memory location.
   2042 /// \param __a
   2043 ///    A 128-bit vector of [2 x double] containing the value to be stored.
   2044 static __inline__ void __DEFAULT_FN_ATTRS
   2045 _mm_storel_pd(double *__dp, __m128d __a)
   2046 {
   2047   struct __mm_storeh_pd_struct {
   2048     double __u;
   2049   } __attribute__((__packed__, __may_alias__));
   2050   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
   2051 }
   2052 
   2053 /// \brief Adds the corresponding elements of two 128-bit vectors of [16 x i8],
   2054 ///    saving the lower 8 bits of each sum in the corresponding element of a
   2055 ///    128-bit result vector of [16 x i8].
   2056 ///
   2057 ///    The integer elements of both parameters can be either signed or unsigned.
   2058 ///
   2059 /// \headerfile <x86intrin.h>
   2060 ///
   2061 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
   2062 ///
   2063 /// \param __a
   2064 ///    A 128-bit vector of [16 x i8].
   2065 /// \param __b
   2066 ///    A 128-bit vector of [16 x i8].
   2067 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
   2068 ///    parameters.
   2069 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2070 _mm_add_epi8(__m128i __a, __m128i __b)
   2071 {
   2072   return (__m128i)((__v16qu)__a + (__v16qu)__b);
   2073 }
   2074 
   2075 /// \brief Adds the corresponding elements of two 128-bit vectors of [8 x i16],
   2076 ///    saving the lower 16 bits of each sum in the corresponding element of a
   2077 ///    128-bit result vector of [8 x i16].
   2078 ///
   2079 ///    The integer elements of both parameters can be either signed or unsigned.
   2080 ///
   2081 /// \headerfile <x86intrin.h>
   2082 ///
   2083 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
   2084 ///
   2085 /// \param __a
   2086 ///    A 128-bit vector of [8 x i16].
   2087 /// \param __b
   2088 ///    A 128-bit vector of [8 x i16].
   2089 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
   2090 ///    parameters.
   2091 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2092 _mm_add_epi16(__m128i __a, __m128i __b)
   2093 {
   2094   return (__m128i)((__v8hu)__a + (__v8hu)__b);
   2095 }
   2096 
   2097 /// \brief Adds the corresponding elements of two 128-bit vectors of [4 x i32],
   2098 ///    saving the lower 32 bits of each sum in the corresponding element of a
   2099 ///    128-bit result vector of [4 x i32].
   2100 ///
   2101 ///    The integer elements of both parameters can be either signed or unsigned.
   2102 ///
   2103 /// \headerfile <x86intrin.h>
   2104 ///
   2105 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
   2106 ///
   2107 /// \param __a
   2108 ///    A 128-bit vector of [4 x i32].
   2109 /// \param __b
   2110 ///    A 128-bit vector of [4 x i32].
   2111 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
   2112 ///    parameters.
   2113 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2114 _mm_add_epi32(__m128i __a, __m128i __b)
   2115 {
   2116   return (__m128i)((__v4su)__a + (__v4su)__b);
   2117 }
   2118 
   2119 /// \brief Adds two signed or unsigned 64-bit integer values, returning the
   2120 ///    lower 64 bits of the sum.
   2121 ///
   2122 /// \headerfile <x86intrin.h>
   2123 ///
   2124 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
   2125 ///
   2126 /// \param __a
   2127 ///    A 64-bit integer.
   2128 /// \param __b
   2129 ///    A 64-bit integer.
   2130 /// \returns A 64-bit integer containing the sum of both parameters.
   2131 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2132 _mm_add_si64(__m64 __a, __m64 __b)
   2133 {
   2134   return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
   2135 }
   2136 
   2137 /// \brief Adds the corresponding elements of two 128-bit vectors of [2 x i64],
   2138 ///    saving the lower 64 bits of each sum in the corresponding element of a
   2139 ///    128-bit result vector of [2 x i64].
   2140 ///
   2141 ///    The integer elements of both parameters can be either signed or unsigned.
   2142 ///
   2143 /// \headerfile <x86intrin.h>
   2144 ///
   2145 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
   2146 ///
   2147 /// \param __a
   2148 ///    A 128-bit vector of [2 x i64].
   2149 /// \param __b
   2150 ///    A 128-bit vector of [2 x i64].
   2151 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
   2152 ///    parameters.
   2153 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2154 _mm_add_epi64(__m128i __a, __m128i __b)
   2155 {
   2156   return (__m128i)((__v2du)__a + (__v2du)__b);
   2157 }
   2158 
   2159 /// \brief Adds, with saturation, the corresponding elements of two 128-bit
   2160 ///    signed [16 x i8] vectors, saving each sum in the corresponding element of
   2161 ///    a 128-bit result vector of [16 x i8]. Positive sums greater than 7Fh are
   2162 ///    saturated to 7Fh. Negative sums less than 80h are saturated to 80h.
   2163 ///
   2164 /// \headerfile <x86intrin.h>
   2165 ///
   2166 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
   2167 ///
   2168 /// \param __a
   2169 ///    A 128-bit signed [16 x i8] vector.
   2170 /// \param __b
   2171 ///    A 128-bit signed [16 x i8] vector.
   2172 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
   2173 ///    both parameters.
   2174 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2175 _mm_adds_epi8(__m128i __a, __m128i __b)
   2176 {
   2177   return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
   2178 }
   2179 
   2180 /// \brief Adds, with saturation, the corresponding elements of two 128-bit
   2181 ///    signed [8 x i16] vectors, saving each sum in the corresponding element of
   2182 ///    a 128-bit result vector of [8 x i16]. Positive sums greater than 7FFFh
   2183 ///    are saturated to 7FFFh. Negative sums less than 8000h are saturated to
   2184 ///    8000h.
   2185 ///
   2186 /// \headerfile <x86intrin.h>
   2187 ///
   2188 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
   2189 ///
   2190 /// \param __a
   2191 ///    A 128-bit signed [8 x i16] vector.
   2192 /// \param __b
   2193 ///    A 128-bit signed [8 x i16] vector.
   2194 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
   2195 ///    both parameters.
   2196 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2197 _mm_adds_epi16(__m128i __a, __m128i __b)
   2198 {
   2199   return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
   2200 }
   2201 
   2202 /// \brief Adds, with saturation, the corresponding elements of two 128-bit
   2203 ///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
   2204 ///    of a 128-bit result vector of [16 x i8]. Positive sums greater than FFh
   2205 ///    are saturated to FFh. Negative sums are saturated to 00h.
   2206 ///
   2207 /// \headerfile <x86intrin.h>
   2208 ///
   2209 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
   2210 ///
   2211 /// \param __a
   2212 ///    A 128-bit unsigned [16 x i8] vector.
   2213 /// \param __b
   2214 ///    A 128-bit unsigned [16 x i8] vector.
   2215 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
   2216 ///    of both parameters.
   2217 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2218 _mm_adds_epu8(__m128i __a, __m128i __b)
   2219 {
   2220   return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
   2221 }
   2222 
   2223 /// \brief Adds, with saturation, the corresponding elements of two 128-bit
   2224 ///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
   2225 ///    of a 128-bit result vector of [8 x i16]. Positive sums greater than FFFFh
   2226 ///    are saturated to FFFFh. Negative sums are saturated to 0000h.
   2227 ///
   2228 /// \headerfile <x86intrin.h>
   2229 ///
   2230 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
   2231 ///
   2232 /// \param __a
   2233 ///    A 128-bit unsigned [8 x i16] vector.
   2234 /// \param __b
   2235 ///    A 128-bit unsigned [8 x i16] vector.
   2236 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
   2237 ///    of both parameters.
   2238 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2239 _mm_adds_epu16(__m128i __a, __m128i __b)
   2240 {
   2241   return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
   2242 }
   2243 
   2244 /// \brief Computes the rounded avarages of corresponding elements of two
   2245 ///    128-bit unsigned [16 x i8] vectors, saving each result in the
   2246 ///    corresponding element of a 128-bit result vector of [16 x i8].
   2247 ///
   2248 /// \headerfile <x86intrin.h>
   2249 ///
   2250 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
   2251 ///
   2252 /// \param __a
   2253 ///    A 128-bit unsigned [16 x i8] vector.
   2254 /// \param __b
   2255 ///    A 128-bit unsigned [16 x i8] vector.
   2256 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
   2257 ///    averages of both parameters.
   2258 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2259 _mm_avg_epu8(__m128i __a, __m128i __b)
   2260 {
   2261   typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
   2262   return (__m128i)__builtin_convertvector(
   2263                ((__builtin_convertvector((__v16qu)__a, __v16hu) +
   2264                  __builtin_convertvector((__v16qu)__b, __v16hu)) + 1)
   2265                  >> 1, __v16qu);
   2266 }
   2267 
   2268 /// \brief Computes the rounded avarages of corresponding elements of two
   2269 ///    128-bit unsigned [8 x i16] vectors, saving each result in the
   2270 ///    corresponding element of a 128-bit result vector of [8 x i16].
   2271 ///
   2272 /// \headerfile <x86intrin.h>
   2273 ///
   2274 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
   2275 ///
   2276 /// \param __a
   2277 ///    A 128-bit unsigned [8 x i16] vector.
   2278 /// \param __b
   2279 ///    A 128-bit unsigned [8 x i16] vector.
   2280 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
   2281 ///    averages of both parameters.
   2282 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2283 _mm_avg_epu16(__m128i __a, __m128i __b)
   2284 {
   2285   typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
   2286   return (__m128i)__builtin_convertvector(
   2287                ((__builtin_convertvector((__v8hu)__a, __v8su) +
   2288                  __builtin_convertvector((__v8hu)__b, __v8su)) + 1)
   2289                  >> 1, __v8hu);
   2290 }
   2291 
   2292 /// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16]
   2293 ///    vectors, producing eight intermediate 32-bit signed integer products, and
   2294 ///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
   2295 ///    [4 x i32] vector.
   2296 ///
   2297 ///    For example, bits [15:0] of both parameters are multiplied producing a
   2298 ///    32-bit product, bits [31:16] of both parameters are multiplied producing
   2299 ///    a 32-bit product, and the sum of those two products becomes bits [31:0]
   2300 ///    of the result.
   2301 ///
   2302 /// \headerfile <x86intrin.h>
   2303 ///
   2304 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
   2305 ///
   2306 /// \param __a
   2307 ///    A 128-bit signed [8 x i16] vector.
   2308 /// \param __b
   2309 ///    A 128-bit signed [8 x i16] vector.
   2310 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
   2311 ///    of both parameters.
   2312 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2313 _mm_madd_epi16(__m128i __a, __m128i __b)
   2314 {
   2315   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
   2316 }
   2317 
   2318 /// \brief Compares corresponding elements of two 128-bit signed [8 x i16]
   2319 ///    vectors, saving the greater value from each comparison in the
   2320 ///    corresponding element of a 128-bit result vector of [8 x i16].
   2321 ///
   2322 /// \headerfile <x86intrin.h>
   2323 ///
   2324 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
   2325 ///
   2326 /// \param __a
   2327 ///    A 128-bit signed [8 x i16] vector.
   2328 /// \param __b
   2329 ///    A 128-bit signed [8 x i16] vector.
   2330 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
   2331 ///    each comparison.
   2332 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2333 _mm_max_epi16(__m128i __a, __m128i __b)
   2334 {
   2335   return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
   2336 }
   2337 
   2338 /// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8]
   2339 ///    vectors, saving the greater value from each comparison in the
   2340 ///    corresponding element of a 128-bit result vector of [16 x i8].
   2341 ///
   2342 /// \headerfile <x86intrin.h>
   2343 ///
   2344 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
   2345 ///
   2346 /// \param __a
   2347 ///    A 128-bit unsigned [16 x i8] vector.
   2348 /// \param __b
   2349 ///    A 128-bit unsigned [16 x i8] vector.
   2350 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
   2351 ///    each comparison.
   2352 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2353 _mm_max_epu8(__m128i __a, __m128i __b)
   2354 {
   2355   return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
   2356 }
   2357 
   2358 /// \brief Compares corresponding elements of two 128-bit signed [8 x i16]
   2359 ///    vectors, saving the smaller value from each comparison in the
   2360 ///    corresponding element of a 128-bit result vector of [8 x i16].
   2361 ///
   2362 /// \headerfile <x86intrin.h>
   2363 ///
   2364 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
   2365 ///
   2366 /// \param __a
   2367 ///    A 128-bit signed [8 x i16] vector.
   2368 /// \param __b
   2369 ///    A 128-bit signed [8 x i16] vector.
   2370 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
   2371 ///    each comparison.
   2372 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2373 _mm_min_epi16(__m128i __a, __m128i __b)
   2374 {
   2375   return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
   2376 }
   2377 
   2378 /// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8]
   2379 ///    vectors, saving the smaller value from each comparison in the
   2380 ///    corresponding element of a 128-bit result vector of [16 x i8].
   2381 ///
   2382 /// \headerfile <x86intrin.h>
   2383 ///
   2384 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
   2385 ///
   2386 /// \param __a
   2387 ///    A 128-bit unsigned [16 x i8] vector.
   2388 /// \param __b
   2389 ///    A 128-bit unsigned [16 x i8] vector.
   2390 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
   2391 ///    each comparison.
   2392 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2393 _mm_min_epu8(__m128i __a, __m128i __b)
   2394 {
   2395   return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
   2396 }
   2397 
   2398 /// \brief Multiplies the corresponding elements of two signed [8 x i16]
   2399 ///    vectors, saving the upper 16 bits of each 32-bit product in the
   2400 ///    corresponding element of a 128-bit signed [8 x i16] result vector.
   2401 ///
   2402 /// \headerfile <x86intrin.h>
   2403 ///
   2404 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
   2405 ///
   2406 /// \param __a
   2407 ///    A 128-bit signed [8 x i16] vector.
   2408 /// \param __b
   2409 ///    A 128-bit signed [8 x i16] vector.
   2410 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
   2411 ///    each of the eight 32-bit products.
   2412 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2413 _mm_mulhi_epi16(__m128i __a, __m128i __b)
   2414 {
   2415   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
   2416 }
   2417 
   2418 /// \brief Multiplies the corresponding elements of two unsigned [8 x i16]
   2419 ///    vectors, saving the upper 16 bits of each 32-bit product in the
   2420 ///    corresponding element of a 128-bit unsigned [8 x i16] result vector.
   2421 ///
   2422 /// \headerfile <x86intrin.h>
   2423 ///
   2424 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
   2425 ///
   2426 /// \param __a
   2427 ///    A 128-bit unsigned [8 x i16] vector.
   2428 /// \param __b
   2429 ///    A 128-bit unsigned [8 x i16] vector.
   2430 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
   2431 ///    of each of the eight 32-bit products.
   2432 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2433 _mm_mulhi_epu16(__m128i __a, __m128i __b)
   2434 {
   2435   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
   2436 }
   2437 
   2438 /// \brief Multiplies the corresponding elements of two signed [8 x i16]
   2439 ///    vectors, saving the lower 16 bits of each 32-bit product in the
   2440 ///    corresponding element of a 128-bit signed [8 x i16] result vector.
   2441 ///
   2442 /// \headerfile <x86intrin.h>
   2443 ///
   2444 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
   2445 ///
   2446 /// \param __a
   2447 ///    A 128-bit signed [8 x i16] vector.
   2448 /// \param __b
   2449 ///    A 128-bit signed [8 x i16] vector.
   2450 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
   2451 ///    each of the eight 32-bit products.
   2452 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2453 _mm_mullo_epi16(__m128i __a, __m128i __b)
   2454 {
   2455   return (__m128i)((__v8hu)__a * (__v8hu)__b);
   2456 }
   2457 
   2458 /// \brief Multiplies 32-bit unsigned integer values contained in the lower bits
   2459 ///    of the two 64-bit integer vectors and returns the 64-bit unsigned
   2460 ///    product.
   2461 ///
   2462 /// \headerfile <x86intrin.h>
   2463 ///
   2464 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
   2465 ///
   2466 /// \param __a
   2467 ///    A 64-bit integer containing one of the source operands.
   2468 /// \param __b
   2469 ///    A 64-bit integer containing one of the source operands.
   2470 /// \returns A 64-bit integer vector containing the product of both operands.
   2471 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2472 _mm_mul_su32(__m64 __a, __m64 __b)
   2473 {
   2474   return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
   2475 }
   2476 
   2477 /// \brief Multiplies 32-bit unsigned integer values contained in the lower
   2478 ///    bits of the corresponding elements of two [2 x i64] vectors, and returns
   2479 ///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
   2480 ///
   2481 /// \headerfile <x86intrin.h>
   2482 ///
   2483 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
   2484 ///
   2485 /// \param __a
   2486 ///    A [2 x i64] vector containing one of the source operands.
   2487 /// \param __b
   2488 ///    A [2 x i64] vector containing one of the source operands.
   2489 /// \returns A [2 x i64] vector containing the product of both operands.
   2490 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2491 _mm_mul_epu32(__m128i __a, __m128i __b)
   2492 {
   2493   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
   2494 }
   2495 
   2496 /// \brief Computes the absolute differences of corresponding 8-bit integer
   2497 ///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
   2498 ///    separately sums the second 8 absolute differences. Packs these two
   2499 ///    unsigned 16-bit integer sums into the upper and lower elements of a
   2500 ///    [2 x i64] vector.
   2501 ///
   2502 /// \headerfile <x86intrin.h>
   2503 ///
   2504 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
   2505 ///
   2506 /// \param __a
   2507 ///    A 128-bit integer vector containing one of the source operands.
   2508 /// \param __b
   2509 ///    A 128-bit integer vector containing one of the source operands.
   2510 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
   2511 ///    differences between both operands.
   2512 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2513 _mm_sad_epu8(__m128i __a, __m128i __b)
   2514 {
   2515   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
   2516 }
   2517 
   2518 /// \brief Subtracts the corresponding 8-bit integer values in the operands.
   2519 ///
   2520 /// \headerfile <x86intrin.h>
   2521 ///
   2522 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
   2523 ///
   2524 /// \param __a
   2525 ///    A 128-bit integer vector containing the minuends.
   2526 /// \param __b
   2527 ///    A 128-bit integer vector containing the subtrahends.
   2528 /// \returns A 128-bit integer vector containing the differences of the values
   2529 ///    in the operands.
   2530 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2531 _mm_sub_epi8(__m128i __a, __m128i __b)
   2532 {
   2533   return (__m128i)((__v16qu)__a - (__v16qu)__b);
   2534 }
   2535 
   2536 /// \brief Subtracts the corresponding 16-bit integer values in the operands.
   2537 ///
   2538 /// \headerfile <x86intrin.h>
   2539 ///
   2540 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
   2541 ///
   2542 /// \param __a
   2543 ///    A 128-bit integer vector containing the minuends.
   2544 /// \param __b
   2545 ///    A 128-bit integer vector containing the subtrahends.
   2546 /// \returns A 128-bit integer vector containing the differences of the values
   2547 ///    in the operands.
   2548 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2549 _mm_sub_epi16(__m128i __a, __m128i __b)
   2550 {
   2551   return (__m128i)((__v8hu)__a - (__v8hu)__b);
   2552 }
   2553 
   2554 /// \brief Subtracts the corresponding 32-bit integer values in the operands.
   2555 ///
   2556 /// \headerfile <x86intrin.h>
   2557 ///
   2558 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
   2559 ///
   2560 /// \param __a
   2561 ///    A 128-bit integer vector containing the minuends.
   2562 /// \param __b
   2563 ///    A 128-bit integer vector containing the subtrahends.
   2564 /// \returns A 128-bit integer vector containing the differences of the values
   2565 ///    in the operands.
   2566 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2567 _mm_sub_epi32(__m128i __a, __m128i __b)
   2568 {
   2569   return (__m128i)((__v4su)__a - (__v4su)__b);
   2570 }
   2571 
   2572 /// \brief Subtracts signed or unsigned 64-bit integer values and writes the
   2573 ///    difference to the corresponding bits in the destination.
   2574 ///
   2575 /// \headerfile <x86intrin.h>
   2576 ///
   2577 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
   2578 ///
   2579 /// \param __a
   2580 ///    A 64-bit integer vector containing the minuend.
   2581 /// \param __b
   2582 ///    A 64-bit integer vector containing the subtrahend.
   2583 /// \returns A 64-bit integer vector containing the difference of the values in
   2584 ///    the operands.
   2585 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2586 _mm_sub_si64(__m64 __a, __m64 __b)
   2587 {
   2588   return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
   2589 }
   2590 
   2591 /// \brief Subtracts the corresponding elements of two [2 x i64] vectors.
   2592 ///
   2593 /// \headerfile <x86intrin.h>
   2594 ///
   2595 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
   2596 ///
   2597 /// \param __a
   2598 ///    A 128-bit integer vector containing the minuends.
   2599 /// \param __b
   2600 ///    A 128-bit integer vector containing the subtrahends.
   2601 /// \returns A 128-bit integer vector containing the differences of the values
   2602 ///    in the operands.
   2603 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2604 _mm_sub_epi64(__m128i __a, __m128i __b)
   2605 {
   2606   return (__m128i)((__v2du)__a - (__v2du)__b);
   2607 }
   2608 
   2609 /// \brief Subtracts corresponding 8-bit signed integer values in the input and
   2610 ///    returns the differences in the corresponding bytes in the destination.
   2611 ///    Differences greater than 7Fh are saturated to 7Fh, and differences less
   2612 ///    than 80h are saturated to 80h.
   2613 ///
   2614 /// \headerfile <x86intrin.h>
   2615 ///
   2616 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
   2617 ///
   2618 /// \param __a
   2619 ///    A 128-bit integer vector containing the minuends.
   2620 /// \param __b
   2621 ///    A 128-bit integer vector containing the subtrahends.
   2622 /// \returns A 128-bit integer vector containing the differences of the values
   2623 ///    in the operands.
   2624 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2625 _mm_subs_epi8(__m128i __a, __m128i __b)
   2626 {
   2627   return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
   2628 }
   2629 
   2630 /// \brief Subtracts corresponding 16-bit signed integer values in the input and
   2631 ///    returns the differences in the corresponding bytes in the destination.
   2632 ///    Differences greater than 7FFFh are saturated to 7FFFh, and values less
   2633 ///    than 8000h are saturated to 8000h.
   2634 ///
   2635 /// \headerfile <x86intrin.h>
   2636 ///
   2637 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
   2638 ///
   2639 /// \param __a
   2640 ///    A 128-bit integer vector containing the minuends.
   2641 /// \param __b
   2642 ///    A 128-bit integer vector containing the subtrahends.
   2643 /// \returns A 128-bit integer vector containing the differences of the values
   2644 ///    in the operands.
   2645 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2646 _mm_subs_epi16(__m128i __a, __m128i __b)
   2647 {
   2648   return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
   2649 }
   2650 
   2651 /// \brief Subtracts corresponding 8-bit unsigned integer values in the input
   2652 ///    and returns the differences in the corresponding bytes in the
   2653 ///    destination. Differences less than 00h are saturated to 00h.
   2654 ///
   2655 /// \headerfile <x86intrin.h>
   2656 ///
   2657 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
   2658 ///
   2659 /// \param __a
   2660 ///    A 128-bit integer vector containing the minuends.
   2661 /// \param __b
   2662 ///    A 128-bit integer vector containing the subtrahends.
   2663 /// \returns A 128-bit integer vector containing the unsigned integer
   2664 ///    differences of the values in the operands.
   2665 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2666 _mm_subs_epu8(__m128i __a, __m128i __b)
   2667 {
   2668   return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
   2669 }
   2670 
   2671 /// \brief Subtracts corresponding 16-bit unsigned integer values in the input
   2672 ///    and returns the differences in the corresponding bytes in the
   2673 ///    destination. Differences less than 0000h are saturated to 0000h.
   2674 ///
   2675 /// \headerfile <x86intrin.h>
   2676 ///
   2677 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
   2678 ///
   2679 /// \param __a
   2680 ///    A 128-bit integer vector containing the minuends.
   2681 /// \param __b
   2682 ///    A 128-bit integer vector containing the subtrahends.
   2683 /// \returns A 128-bit integer vector containing the unsigned integer
   2684 ///    differences of the values in the operands.
   2685 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2686 _mm_subs_epu16(__m128i __a, __m128i __b)
   2687 {
   2688   return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
   2689 }
   2690 
   2691 /// \brief Performs a bitwise AND of two 128-bit integer vectors.
   2692 ///
   2693 /// \headerfile <x86intrin.h>
   2694 ///
   2695 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
   2696 ///
   2697 /// \param __a
   2698 ///    A 128-bit integer vector containing one of the source operands.
   2699 /// \param __b
   2700 ///    A 128-bit integer vector containing one of the source operands.
   2701 /// \returns A 128-bit integer vector containing the bitwise AND of the values
   2702 ///    in both operands.
   2703 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2704 _mm_and_si128(__m128i __a, __m128i __b)
   2705 {
   2706   return (__m128i)((__v2du)__a & (__v2du)__b);
   2707 }
   2708 
   2709 /// \brief Performs a bitwise AND of two 128-bit integer vectors, using the
   2710 ///    one's complement of the values contained in the first source operand.
   2711 ///
   2712 /// \headerfile <x86intrin.h>
   2713 ///
   2714 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
   2715 ///
   2716 /// \param __a
   2717 ///    A 128-bit vector containing the left source operand. The one's complement
   2718 ///    of this value is used in the bitwise AND.
   2719 /// \param __b
   2720 ///    A 128-bit vector containing the right source operand.
   2721 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
   2722 ///    complement of the first operand and the values in the second operand.
   2723 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2724 _mm_andnot_si128(__m128i __a, __m128i __b)
   2725 {
   2726   return (__m128i)(~(__v2du)__a & (__v2du)__b);
   2727 }
   2728 /// \brief Performs a bitwise OR of two 128-bit integer vectors.
   2729 ///
   2730 /// \headerfile <x86intrin.h>
   2731 ///
   2732 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
   2733 ///
   2734 /// \param __a
   2735 ///    A 128-bit integer vector containing one of the source operands.
   2736 /// \param __b
   2737 ///    A 128-bit integer vector containing one of the source operands.
   2738 /// \returns A 128-bit integer vector containing the bitwise OR of the values
   2739 ///    in both operands.
   2740 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2741 _mm_or_si128(__m128i __a, __m128i __b)
   2742 {
   2743   return (__m128i)((__v2du)__a | (__v2du)__b);
   2744 }
   2745 
   2746 /// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors.
   2747 ///
   2748 /// \headerfile <x86intrin.h>
   2749 ///
   2750 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
   2751 ///
   2752 /// \param __a
   2753 ///    A 128-bit integer vector containing one of the source operands.
   2754 /// \param __b
   2755 ///    A 128-bit integer vector containing one of the source operands.
   2756 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
   2757 ///    values in both operands.
   2758 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2759 _mm_xor_si128(__m128i __a, __m128i __b)
   2760 {
   2761   return (__m128i)((__v2du)__a ^ (__v2du)__b);
   2762 }
   2763 
   /// \brief Shifts the 128-bit integer vector operand to the left by the
   ///    specified number of bytes. The vacated low-order bytes are cleared.
   ///
   ///    The shift is expressed as a byte shuffle of a zero vector (shuffle
   ///    indices 0-15) and operand \a a (shuffle indices 16-31). When any of
   ///    bits [7:4] of \a imm, taken as a char, is set, every index selects
   ///    from the zero vector, so the result is all zeros.
   ///
   /// \headerfile <x86intrin.h>
   ///
   /// \code
   /// __m128i _mm_slli_si128(__m128i a, const int imm);
   /// \endcode
   ///
   /// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
   ///
   /// \param a
   ///    A 128-bit integer vector containing the source operand.
   /// \param imm
   ///    An immediate value specifying the number of bytes to left-shift operand
   ///    \a a.
   /// \returns A 128-bit integer vector holding the left-shifted value.
   #define _mm_slli_si128(a, imm) __extension__ ({                      \
     (__m128i)__builtin_shufflevector(                                  \
         (__v16qi)_mm_setzero_si128(),                                  \
         (__v16qi)(__m128i)(a),                                         \
         (((char)(imm) & 0xF0) != 0) ?  0 : 16 - (char)(imm),           \
         (((char)(imm) & 0xF0) != 0) ?  1 : 17 - (char)(imm),           \
         (((char)(imm) & 0xF0) != 0) ?  2 : 18 - (char)(imm),           \
         (((char)(imm) & 0xF0) != 0) ?  3 : 19 - (char)(imm),           \
         (((char)(imm) & 0xF0) != 0) ?  4 : 20 - (char)(imm),           \
         (((char)(imm) & 0xF0) != 0) ?  5 : 21 - (char)(imm),           \
         (((char)(imm) & 0xF0) != 0) ?  6 : 22 - (char)(imm),           \
         (((char)(imm) & 0xF0) != 0) ?  7 : 23 - (char)(imm),           \
         (((char)(imm) & 0xF0) != 0) ?  8 : 24 - (char)(imm),           \
         (((char)(imm) & 0xF0) != 0) ?  9 : 25 - (char)(imm),           \
         (((char)(imm) & 0xF0) != 0) ? 10 : 26 - (char)(imm),           \
         (((char)(imm) & 0xF0) != 0) ? 11 : 27 - (char)(imm),           \
         (((char)(imm) & 0xF0) != 0) ? 12 : 28 - (char)(imm),           \
         (((char)(imm) & 0xF0) != 0) ? 13 : 29 - (char)(imm),           \
         (((char)(imm) & 0xF0) != 0) ? 14 : 30 - (char)(imm),           \
         (((char)(imm) & 0xF0) != 0) ? 15 : 31 - (char)(imm)); })
   2801 
   2802 #define _mm_bslli_si128(a, imm) \
   2803   _mm_slli_si128((a), (imm))
   2804 
   2805 /// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
   2806 ///    by the specified number of bits. Low-order bits are cleared.
   2807 ///
   2808 /// \headerfile <x86intrin.h>
   2809 ///
   2810 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
   2811 ///
   2812 /// \param __a
   2813 ///    A 128-bit integer vector containing the source operand.
   2814 /// \param __count
   2815 ///    An integer value specifying the number of bits to left-shift each value
   2816 ///    in operand \a __a.
   2817 /// \returns A 128-bit integer vector containing the left-shifted values.
   2818 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2819 _mm_slli_epi16(__m128i __a, int __count)
   2820 {
   2821   return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
   2822 }
   2823 
   2824 /// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
   2825 ///    by the specified number of bits. Low-order bits are cleared.
   2826 ///
   2827 /// \headerfile <x86intrin.h>
   2828 ///
   2829 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
   2830 ///
   2831 /// \param __a
   2832 ///    A 128-bit integer vector containing the source operand.
   2833 /// \param __count
   2834 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
   2835 ///    to left-shift each value in operand \a __a.
   2836 /// \returns A 128-bit integer vector containing the left-shifted values.
   2837 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2838 _mm_sll_epi16(__m128i __a, __m128i __count)
   2839 {
   2840   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
   2841 }
   2842 
   2843 /// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
   2844 ///    by the specified number of bits. Low-order bits are cleared.
   2845 ///
   2846 /// \headerfile <x86intrin.h>
   2847 ///
   2848 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
   2849 ///
   2850 /// \param __a
   2851 ///    A 128-bit integer vector containing the source operand.
   2852 /// \param __count
   2853 ///    An integer value specifying the number of bits to left-shift each value
   2854 ///    in operand \a __a.
   2855 /// \returns A 128-bit integer vector containing the left-shifted values.
   2856 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2857 _mm_slli_epi32(__m128i __a, int __count)
   2858 {
   2859   return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
   2860 }
   2861 
   2862 /// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
   2863 ///    by the specified number of bits. Low-order bits are cleared.
   2864 ///
   2865 /// \headerfile <x86intrin.h>
   2866 ///
   2867 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
   2868 ///
   2869 /// \param __a
   2870 ///    A 128-bit integer vector containing the source operand.
   2871 /// \param __count
   2872 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
   2873 ///    to left-shift each value in operand \a __a.
   2874 /// \returns A 128-bit integer vector containing the left-shifted values.
   2875 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2876 _mm_sll_epi32(__m128i __a, __m128i __count)
   2877 {
   2878   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
   2879 }
   2880 
   2881 /// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
   2882 ///    by the specified number of bits. Low-order bits are cleared.
   2883 ///
   2884 /// \headerfile <x86intrin.h>
   2885 ///
   2886 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
   2887 ///
   2888 /// \param __a
   2889 ///    A 128-bit integer vector containing the source operand.
   2890 /// \param __count
   2891 ///    An integer value specifying the number of bits to left-shift each value
   2892 ///    in operand \a __a.
   2893 /// \returns A 128-bit integer vector containing the left-shifted values.
   2894 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2895 _mm_slli_epi64(__m128i __a, int __count)
   2896 {
   2897   return __builtin_ia32_psllqi128((__v2di)__a, __count);
   2898 }
   2899 
   2900 /// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
   2901 ///    by the specified number of bits. Low-order bits are cleared.
   2902 ///
   2903 /// \headerfile <x86intrin.h>
   2904 ///
   2905 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
   2906 ///
   2907 /// \param __a
   2908 ///    A 128-bit integer vector containing the source operand.
   2909 /// \param __count
   2910 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
   2911 ///    to left-shift each value in operand \a __a.
   2912 /// \returns A 128-bit integer vector containing the left-shifted values.
   2913 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2914 _mm_sll_epi64(__m128i __a, __m128i __count)
   2915 {
   2916   return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
   2917 }
   2918 
   2919 /// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
   2920 ///    by the specified number of bits. High-order bits are filled with the sign
   2921 ///    bit of the initial value.
   2922 ///
   2923 /// \headerfile <x86intrin.h>
   2924 ///
   2925 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
   2926 ///
   2927 /// \param __a
   2928 ///    A 128-bit integer vector containing the source operand.
   2929 /// \param __count
   2930 ///    An integer value specifying the number of bits to right-shift each value
   2931 ///    in operand \a __a.
   2932 /// \returns A 128-bit integer vector containing the right-shifted values.
   2933 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2934 _mm_srai_epi16(__m128i __a, int __count)
   2935 {
   2936   return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
   2937 }
   2938 
   2939 /// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
   2940 ///    by the specified number of bits. High-order bits are filled with the sign
   2941 ///    bit of the initial value.
   2942 ///
   2943 /// \headerfile <x86intrin.h>
   2944 ///
   2945 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
   2946 ///
   2947 /// \param __a
   2948 ///    A 128-bit integer vector containing the source operand.
   2949 /// \param __count
   2950 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
   2951 ///    to right-shift each value in operand \a __a.
   2952 /// \returns A 128-bit integer vector containing the right-shifted values.
   2953 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2954 _mm_sra_epi16(__m128i __a, __m128i __count)
   2955 {
   2956   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
   2957 }
   2958 
   2959 /// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
   2960 ///    by the specified number of bits. High-order bits are filled with the sign
   2961 ///    bit of the initial value.
   2962 ///
   2963 /// \headerfile <x86intrin.h>
   2964 ///
   2965 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
   2966 ///
   2967 /// \param __a
   2968 ///    A 128-bit integer vector containing the source operand.
   2969 /// \param __count
   2970 ///    An integer value specifying the number of bits to right-shift each value
   2971 ///    in operand \a __a.
   2972 /// \returns A 128-bit integer vector containing the right-shifted values.
   2973 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2974 _mm_srai_epi32(__m128i __a, int __count)
   2975 {
   2976   return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
   2977 }
   2978 
   2979 /// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
   2980 ///    by the specified number of bits. High-order bits are filled with the sign
   2981 ///    bit of the initial value.
   2982 ///
   2983 /// \headerfile <x86intrin.h>
   2984 ///
   2985 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
   2986 ///
   2987 /// \param __a
   2988 ///    A 128-bit integer vector containing the source operand.
   2989 /// \param __count
   2990 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
   2991 ///    to right-shift each value in operand \a __a.
   2992 /// \returns A 128-bit integer vector containing the right-shifted values.
   2993 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2994 _mm_sra_epi32(__m128i __a, __m128i __count)
   2995 {
   2996   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
   2997 }
   2998 
   2999 /// \brief Right-shifts the 128-bit integer vector operand by the specified
   3000 ///    number of bytes. High-order bits are cleared.
   3001 ///
   3002 /// \headerfile <x86intrin.h>
   3003 ///
   3004 /// \code
   3005 /// __m128i _mm_srli_si128(__m128i a, const int imm);
   3006 /// \endcode
   3007 ///
   3008 /// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
   3009 ///
   3010 /// \param a
   3011 ///    A 128-bit integer vector containing the source operand.
   3012 /// \param imm
   3013 ///    An immediate value specifying the number of bytes to right-shift operand
   3014 ///    \a a.
   3015 /// \returns A 128-bit integer vector containing the right-shifted value.
   3016 #define _mm_srli_si128(a, imm) __extension__ ({                              \
   3017   (__m128i)__builtin_shufflevector(                                          \
   3018                                  (__v16qi)(__m128i)(a),                      \
   3019                                  (__v16qi)_mm_setzero_si128(),               \
   3020                                  ((char)(imm)&0xF0) ? 16 : (char)(imm) + 0,  \
   3021                                  ((char)(imm)&0xF0) ? 17 : (char)(imm) + 1,  \
   3022                                  ((char)(imm)&0xF0) ? 18 : (char)(imm) + 2,  \
   3023                                  ((char)(imm)&0xF0) ? 19 : (char)(imm) + 3,  \
   3024                                  ((char)(imm)&0xF0) ? 20 : (char)(imm) + 4,  \
   3025                                  ((char)(imm)&0xF0) ? 21 : (char)(imm) + 5,  \
   3026                                  ((char)(imm)&0xF0) ? 22 : (char)(imm) + 6,  \
   3027                                  ((char)(imm)&0xF0) ? 23 : (char)(imm) + 7,  \
   3028                                  ((char)(imm)&0xF0) ? 24 : (char)(imm) + 8,  \
   3029                                  ((char)(imm)&0xF0) ? 25 : (char)(imm) + 9,  \
   3030                                  ((char)(imm)&0xF0) ? 26 : (char)(imm) + 10, \
   3031                                  ((char)(imm)&0xF0) ? 27 : (char)(imm) + 11, \
   3032                                  ((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \
   3033                                  ((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \
   3034                                  ((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \
   3035                                  ((char)(imm)&0xF0) ? 31 : (char)(imm) + 15); })
   3036 
   3037 #define _mm_bsrli_si128(a, imm) \
   3038   _mm_srli_si128((a), (imm))
   3039 
   3040 /// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
   3041 ///    operand by the specified number of bits. High-order bits are cleared.
   3042 ///
   3043 /// \headerfile <x86intrin.h>
   3044 ///
   3045 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
   3046 ///
   3047 /// \param __a
   3048 ///    A 128-bit integer vector containing the source operand.
   3049 /// \param __count
   3050 ///    An integer value specifying the number of bits to right-shift each value
   3051 ///    in operand \a __a.
   3052 /// \returns A 128-bit integer vector containing the right-shifted values.
   3053 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3054 _mm_srli_epi16(__m128i __a, int __count)
   3055 {
   3056   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
   3057 }
   3058 
   3059 /// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
   3060 ///    operand by the specified number of bits. High-order bits are cleared.
   3061 ///
   3062 /// \headerfile <x86intrin.h>
   3063 ///
   3064 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
   3065 ///
   3066 /// \param __a
   3067 ///    A 128-bit integer vector containing the source operand.
   3068 /// \param __count
   3069 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
   3070 ///    to right-shift each value in operand \a __a.
   3071 /// \returns A 128-bit integer vector containing the right-shifted values.
   3072 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3073 _mm_srl_epi16(__m128i __a, __m128i __count)
   3074 {
   3075   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
   3076 }
   3077 
   3078 /// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
   3079 ///    operand by the specified number of bits. High-order bits are cleared.
   3080 ///
   3081 /// \headerfile <x86intrin.h>
   3082 ///
   3083 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
   3084 ///
   3085 /// \param __a
   3086 ///    A 128-bit integer vector containing the source operand.
   3087 /// \param __count
   3088 ///    An integer value specifying the number of bits to right-shift each value
   3089 ///    in operand \a __a.
   3090 /// \returns A 128-bit integer vector containing the right-shifted values.
   3091 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3092 _mm_srli_epi32(__m128i __a, int __count)
   3093 {
   3094   return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
   3095 }
   3096 
   3097 /// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
   3098 ///    operand by the specified number of bits. High-order bits are cleared.
   3099 ///
   3100 /// \headerfile <x86intrin.h>
   3101 ///
   3102 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
   3103 ///
   3104 /// \param __a
   3105 ///    A 128-bit integer vector containing the source operand.
   3106 /// \param __count
   3107 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
   3108 ///    to right-shift each value in operand \a __a.
   3109 /// \returns A 128-bit integer vector containing the right-shifted values.
   3110 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3111 _mm_srl_epi32(__m128i __a, __m128i __count)
   3112 {
   3113   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
   3114 }
   3115 
   3116 /// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
   3117 ///    operand by the specified number of bits. High-order bits are cleared.
   3118 ///
   3119 /// \headerfile <x86intrin.h>
   3120 ///
   3121 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
   3122 ///
   3123 /// \param __a
   3124 ///    A 128-bit integer vector containing the source operand.
   3125 /// \param __count
   3126 ///    An integer value specifying the number of bits to right-shift each value
   3127 ///    in operand \a __a.
   3128 /// \returns A 128-bit integer vector containing the right-shifted values.
   3129 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3130 _mm_srli_epi64(__m128i __a, int __count)
   3131 {
   3132   return __builtin_ia32_psrlqi128((__v2di)__a, __count);
   3133 }
   3134 
   3135 /// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
   3136 ///    operand by the specified number of bits. High-order bits are cleared.
   3137 ///
   3138 /// \headerfile <x86intrin.h>
   3139 ///
   3140 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
   3141 ///
   3142 /// \param __a
   3143 ///    A 128-bit integer vector containing the source operand.
   3144 /// \param __count
   3145 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
   3146 ///    to right-shift each value in operand \a __a.
   3147 /// \returns A 128-bit integer vector containing the right-shifted values.
   3148 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3149 _mm_srl_epi64(__m128i __a, __m128i __count)
   3150 {
   3151   return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
   3152 }
   3153 
   3154 /// \brief Compares each of the corresponding 8-bit values of the 128-bit
   3155 ///    integer vectors for equality. Each comparison yields 0h for false, FFh
   3156 ///    for true.
   3157 ///
   3158 /// \headerfile <x86intrin.h>
   3159 ///
   3160 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
   3161 ///
   3162 /// \param __a
   3163 ///    A 128-bit integer vector.
   3164 /// \param __b
   3165 ///    A 128-bit integer vector.
   3166 /// \returns A 128-bit integer vector containing the comparison results.
   3167 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3168 _mm_cmpeq_epi8(__m128i __a, __m128i __b)
   3169 {
   3170   return (__m128i)((__v16qi)__a == (__v16qi)__b);
   3171 }
   3172 
   3173 /// \brief Compares each of the corresponding 16-bit values of the 128-bit
   3174 ///    integer vectors for equality. Each comparison yields 0h for false, FFFFh
   3175 ///    for true.
   3176 ///
   3177 /// \headerfile <x86intrin.h>
   3178 ///
   3179 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
   3180 ///
   3181 /// \param __a
   3182 ///    A 128-bit integer vector.
   3183 /// \param __b
   3184 ///    A 128-bit integer vector.
   3185 /// \returns A 128-bit integer vector containing the comparison results.
   3186 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3187 _mm_cmpeq_epi16(__m128i __a, __m128i __b)
   3188 {
   3189   return (__m128i)((__v8hi)__a == (__v8hi)__b);
   3190 }
   3191 
   3192 /// \brief Compares each of the corresponding 32-bit values of the 128-bit
   3193 ///    integer vectors for equality. Each comparison yields 0h for false,
   3194 ///    FFFFFFFFh for true.
   3195 ///
   3196 /// \headerfile <x86intrin.h>
   3197 ///
   3198 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
   3199 ///
   3200 /// \param __a
   3201 ///    A 128-bit integer vector.
   3202 /// \param __b
   3203 ///    A 128-bit integer vector.
   3204 /// \returns A 128-bit integer vector containing the comparison results.
   3205 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3206 _mm_cmpeq_epi32(__m128i __a, __m128i __b)
   3207 {
   3208   return (__m128i)((__v4si)__a == (__v4si)__b);
   3209 }
   3210 
   3211 /// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
   3212 ///    integer vectors to determine if the values in the first operand are
   3213 ///    greater than those in the second operand. Each comparison yields 0h for
   3214 ///    false, FFh for true.
   3215 ///
   3216 /// \headerfile <x86intrin.h>
   3217 ///
   3218 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
   3219 ///
   3220 /// \param __a
   3221 ///    A 128-bit integer vector.
   3222 /// \param __b
   3223 ///    A 128-bit integer vector.
   3224 /// \returns A 128-bit integer vector containing the comparison results.
   3225 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3226 _mm_cmpgt_epi8(__m128i __a, __m128i __b)
   3227 {
   3228   /* This function always performs a signed comparison, but __v16qi is a char
   3229      which may be signed or unsigned, so use __v16qs. */
   3230   return (__m128i)((__v16qs)__a > (__v16qs)__b);
   3231 }
   3232 
   3233 /// \brief Compares each of the corresponding signed 16-bit values of the
   3234 ///    128-bit integer vectors to determine if the values in the first operand
   3235 ///    are greater than those in the second operand.
   3236 ///
   3237 ///    Each comparison yields 0h for false, FFFFh for true.
   3238 ///
   3239 /// \headerfile <x86intrin.h>
   3240 ///
   3241 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
   3242 ///
   3243 /// \param __a
   3244 ///    A 128-bit integer vector.
   3245 /// \param __b
   3246 ///    A 128-bit integer vector.
   3247 /// \returns A 128-bit integer vector containing the comparison results.
   3248 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3249 _mm_cmpgt_epi16(__m128i __a, __m128i __b)
   3250 {
   3251   return (__m128i)((__v8hi)__a > (__v8hi)__b);
   3252 }
   3253 
   3254 /// \brief Compares each of the corresponding signed 32-bit values of the
   3255 ///    128-bit integer vectors to determine if the values in the first operand
   3256 ///    are greater than those in the second operand.
   3257 ///
   3258 ///    Each comparison yields 0h for false, FFFFFFFFh for true.
   3259 ///
   3260 /// \headerfile <x86intrin.h>
   3261 ///
   3262 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
   3263 ///
   3264 /// \param __a
   3265 ///    A 128-bit integer vector.
   3266 /// \param __b
   3267 ///    A 128-bit integer vector.
   3268 /// \returns A 128-bit integer vector containing the comparison results.
   3269 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3270 _mm_cmpgt_epi32(__m128i __a, __m128i __b)
   3271 {
   3272   return (__m128i)((__v4si)__a > (__v4si)__b);
   3273 }
   3274 
   3275 /// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
   3276 ///    integer vectors to determine if the values in the first operand are less
   3277 ///    than those in the second operand.
   3278 ///
   3279 ///    Each comparison yields 0h for false, FFh for true.
   3280 ///
   3281 /// \headerfile <x86intrin.h>
   3282 ///
   3283 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
   3284 ///
   3285 /// \param __a
   3286 ///    A 128-bit integer vector.
   3287 /// \param __b
   3288 ///    A 128-bit integer vector.
   3289 /// \returns A 128-bit integer vector containing the comparison results.
   3290 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3291 _mm_cmplt_epi8(__m128i __a, __m128i __b)
   3292 {
   3293   return _mm_cmpgt_epi8(__b, __a);
   3294 }
   3295 
   3296 /// \brief Compares each of the corresponding signed 16-bit values of the
   3297 ///    128-bit integer vectors to determine if the values in the first operand
   3298 ///    are less than those in the second operand.
   3299 ///
   3300 ///    Each comparison yields 0h for false, FFFFh for true.
   3301 ///
   3302 /// \headerfile <x86intrin.h>
   3303 ///
   3304 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
   3305 ///
   3306 /// \param __a
   3307 ///    A 128-bit integer vector.
   3308 /// \param __b
   3309 ///    A 128-bit integer vector.
   3310 /// \returns A 128-bit integer vector containing the comparison results.
   3311 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3312 _mm_cmplt_epi16(__m128i __a, __m128i __b)
   3313 {
   3314   return _mm_cmpgt_epi16(__b, __a);
   3315 }
   3316 
   3317 /// \brief Compares each of the corresponding signed 32-bit values of the
   3318 ///    128-bit integer vectors to determine if the values in the first operand
   3319 ///    are less than those in the second operand.
   3320 ///
   3321 ///    Each comparison yields 0h for false, FFFFFFFFh for true.
   3322 ///
   3323 /// \headerfile <x86intrin.h>
   3324 ///
   3325 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
   3326 ///
   3327 /// \param __a
   3328 ///    A 128-bit integer vector.
   3329 /// \param __b
   3330 ///    A 128-bit integer vector.
   3331 /// \returns A 128-bit integer vector containing the comparison results.
   3332 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3333 _mm_cmplt_epi32(__m128i __a, __m128i __b)
   3334 {
   3335   return _mm_cmpgt_epi32(__b, __a);
   3336 }
   3337 
   3338 #ifdef __x86_64__
   3339 /// \brief Converts a 64-bit signed integer value from the second operand into a
   3340 ///    double-precision value and returns it in the lower element of a [2 x
   3341 ///    double] vector; the upper element of the returned vector is copied from
   3342 ///    the upper element of the first operand.
   3343 ///
   3344 /// \headerfile <x86intrin.h>
   3345 ///
   3346 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
   3347 ///
   3348 /// \param __a
   3349 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
   3350 ///    copied to the upper 64 bits of the destination.
   3351 /// \param __b
   3352 ///    A 64-bit signed integer operand containing the value to be converted.
   3353 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
   3354 ///    converted value of the second operand. The upper 64 bits are copied from
   3355 ///    the upper 64 bits of the first operand.
   3356 static __inline__ __m128d __DEFAULT_FN_ATTRS
   3357 _mm_cvtsi64_sd(__m128d __a, long long __b)
   3358 {
   3359   __a[0] = __b;
   3360   return __a;
   3361 }
   3362 
   3363 /// \brief Converts the first (lower) element of a vector of [2 x double] into a
   3364 ///    64-bit signed integer value, according to the current rounding mode.
   3365 ///
   3366 /// \headerfile <x86intrin.h>
   3367 ///
   3368 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
   3369 ///
   3370 /// \param __a
   3371 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
   3372 ///    conversion.
   3373 /// \returns A 64-bit signed integer containing the converted value.
   3374 static __inline__ long long __DEFAULT_FN_ATTRS
   3375 _mm_cvtsd_si64(__m128d __a)
   3376 {
   3377   return __builtin_ia32_cvtsd2si64((__v2df)__a);
   3378 }
   3379 
   3380 /// \brief Converts the first (lower) element of a vector of [2 x double] into a
   3381 ///    64-bit signed integer value, truncating the result when it is inexact.
   3382 ///
   3383 /// \headerfile <x86intrin.h>
   3384 ///
   3385 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
   3386 ///   instruction.
   3387 ///
   3388 /// \param __a
   3389 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
   3390 ///    conversion.
   3391 /// \returns A 64-bit signed integer containing the converted value.
   3392 static __inline__ long long __DEFAULT_FN_ATTRS
   3393 _mm_cvttsd_si64(__m128d __a)
   3394 {
   3395   return __builtin_ia32_cvttsd2si64((__v2df)__a);
   3396 }
   3397 #endif
   3398 
   3399 /// \brief Converts a vector of [4 x i32] into a vector of [4 x float].
   3400 ///
   3401 /// \headerfile <x86intrin.h>
   3402 ///
   3403 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
   3404 ///
   3405 /// \param __a
   3406 ///    A 128-bit integer vector.
   3407 /// \returns A 128-bit vector of [4 x float] containing the converted values.
   3408 static __inline__ __m128 __DEFAULT_FN_ATTRS
   3409 _mm_cvtepi32_ps(__m128i __a)
   3410 {
   3411   return __builtin_ia32_cvtdq2ps((__v4si)__a);
   3412 }
   3413 
   3414 /// \brief Converts a vector of [4 x float] into a vector of [4 x i32].
   3415 ///
   3416 /// \headerfile <x86intrin.h>
   3417 ///
   3418 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
   3419 ///
   3420 /// \param __a
   3421 ///    A 128-bit vector of [4 x float].
   3422 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
   3423 ///    values.
   3424 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3425 _mm_cvtps_epi32(__m128 __a)
   3426 {
   3427   return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
   3428 }
   3429 
   3430 /// \brief Converts a vector of [4 x float] into a vector of [4 x i32],
   3431 ///    truncating the result when it is inexact.
   3432 ///
   3433 /// \headerfile <x86intrin.h>
   3434 ///
   3435 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
   3436 ///   instruction.
   3437 ///
   3438 /// \param __a
   3439 ///    A 128-bit vector of [4 x float].
   3440 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
   3441 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3442 _mm_cvttps_epi32(__m128 __a)
   3443 {
   3444   return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
   3445 }
   3446 
   3447 /// \brief Returns a vector of [4 x i32] where the lowest element is the input
   3448 ///    operand and the remaining elements are zero.
   3449 ///
   3450 /// \headerfile <x86intrin.h>
   3451 ///
   3452 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
   3453 ///
   3454 /// \param __a
   3455 ///    A 32-bit signed integer operand.
   3456 /// \returns A 128-bit vector of [4 x i32].
   3457 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3458 _mm_cvtsi32_si128(int __a)
   3459 {
   3460   return (__m128i)(__v4si){ __a, 0, 0, 0 };
   3461 }
   3462 
   3463 #ifdef __x86_64__
   3464 /// \brief Returns a vector of [2 x i64] where the lower element is the input
   3465 ///    operand and the upper element is zero.
   3466 ///
   3467 /// \headerfile <x86intrin.h>
   3468 ///
   3469 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
   3470 ///
   3471 /// \param __a
   3472 ///    A 64-bit signed integer operand containing the value to be converted.
   3473 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
   3474 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3475 _mm_cvtsi64_si128(long long __a)
   3476 {
   3477   return (__m128i){ __a, 0 };
   3478 }
   3479 #endif
   3480 
   3481 /// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a
   3482 ///    32-bit signed integer value.
   3483 ///
   3484 /// \headerfile <x86intrin.h>
   3485 ///
   3486 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
   3487 ///
   3488 /// \param __a
   3489 ///    A vector of [4 x i32]. The least significant 32 bits are moved to the
   3490 ///    destination.
   3491 /// \returns A 32-bit signed integer containing the moved value.
   3492 static __inline__ int __DEFAULT_FN_ATTRS
   3493 _mm_cvtsi128_si32(__m128i __a)
   3494 {
   3495   __v4si __b = (__v4si)__a;
   3496   return __b[0];
   3497 }
   3498 
   3499 #ifdef __x86_64__
   3500 /// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a
   3501 ///    64-bit signed integer value.
   3502 ///
   3503 /// \headerfile <x86intrin.h>
   3504 ///
   3505 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
   3506 ///
   3507 /// \param __a
   3508 ///    A vector of [2 x i64]. The least significant 64 bits are moved to the
   3509 ///    destination.
   3510 /// \returns A 64-bit signed integer containing the moved value.
   3511 static __inline__ long long __DEFAULT_FN_ATTRS
   3512 _mm_cvtsi128_si64(__m128i __a)
   3513 {
   3514   return __a[0]; /* __m128i is indexed as long long elements; only element 0 is returned */
   3515 }
   3516 #endif
   3517 
   3518 /// \brief Moves packed integer values from an aligned 128-bit memory location
   3519 ///    to elements in a 128-bit integer vector.
   3520 ///
   3521 /// \headerfile <x86intrin.h>
   3522 ///
   3523 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
   3524 ///
   3525 /// \param __p
   3526 ///    An aligned pointer to a memory location containing integer values.
   3527 /// \returns A 128-bit integer vector containing the moved values.
   3528 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3529 _mm_load_si128(__m128i const *__p)
   3530 {
   3531   return *__p; /* direct dereference as a whole __m128i; no packed wrapper type is used */
   3532 }
   3533 
   3534 /// \brief Moves packed integer values from an unaligned 128-bit memory location
   3535 ///    to elements in a 128-bit integer vector.
   3536 ///
   3537 /// \headerfile <x86intrin.h>
   3538 ///
   3539 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
   3540 ///
   3541 /// \param __p
   3542 ///    A pointer to a memory location containing integer values.
   3543 /// \returns A 128-bit integer vector containing the moved values.
   3544 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3545 _mm_loadu_si128(__m128i const *__p)
   3546 {
   3547   struct __loadu_si128 { /* wrapper type: __packed__ + __may_alias__ around one __m128i */
   3548     __m128i __v;
   3549   } __attribute__((__packed__, __may_alias__));
   3550   return ((const struct __loadu_si128*)__p)->__v; /* read-only access: the cast keeps __p's const qualifier */
   3551 }
   3552 
   3553 /// \brief Returns a vector of [2 x i64] where the lower element is taken from
   3554 ///    the lower element of the operand, and the upper element is zero.
   3555 ///
   3556 /// \headerfile <x86intrin.h>
   3557 ///
   3558 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
   3559 ///
   3560 /// \param __p
   3561 ///    A pointer to a 128-bit vector of [2 x i64]. Bits [63:0] of the pointed-to
   3562 ///    value are written to bits [63:0] of the destination.
   3563 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
   3564 ///    moved value. The higher order bits are cleared.
   3565 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3566 _mm_loadl_epi64(__m128i const *__p)
   3567 {
   3568   struct __mm_loadl_epi64_struct { /* wrapper type: __packed__ + __may_alias__ around one long long */
   3569     long long __u;
   3570   } __attribute__((__packed__, __may_alias__));
   3571   return (__m128i) { ((const struct __mm_loadl_epi64_struct*)__p)->__u, 0}; /* reads a single long long through __p (const kept); element 1 = 0 */
   3572 }
   3573 
   3574 /// \brief Generates a 128-bit vector of [4 x i32] with unspecified content.
   3575 ///    This could be used as an argument to another intrinsic function where the
   3576 ///    argument is required but the value is not actually used.
   3577 ///
   3578 /// \headerfile <x86intrin.h>
   3579 ///
   3580 /// This intrinsic has no corresponding instruction.
   3581 ///
   3582 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
   3583 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3584 _mm_undefined_si128(void)
   3585 {
   3586   return (__m128i)__builtin_ia32_undef128(); /* the builtin's result is cast to __m128i and returned unchanged */
   3587 }
   3588 
   3589 /// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
   3590 ///    the specified 64-bit integer values.
   3591 ///
   3592 /// \headerfile <x86intrin.h>
   3593 ///
   3594 /// This intrinsic is a utility function and does not correspond to a specific
   3595 ///    instruction.
   3596 ///
   3597 /// \param __q1
   3598 ///    A 64-bit integer value used to initialize the upper 64 bits of the
   3599 ///    destination vector of [2 x i64].
   3600 /// \param __q0
   3601 ///    A 64-bit integer value used to initialize the lower 64 bits of the
   3602 ///    destination vector of [2 x i64].
   3603 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
   3604 ///    provided in the operands.
   3605 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3606 _mm_set_epi64x(long long __q1, long long __q0)
   3607 {
   3608   return (__m128i){ __q0, __q1 }; /* initializer lists element 0 first, so __q0 precedes __q1 */
   3609 }
   3610 
   3611 /// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
   3612 ///    the specified 64-bit integer values.
   3613 ///
   3614 /// \headerfile <x86intrin.h>
   3615 ///
   3616 /// This intrinsic is a utility function and does not correspond to a specific
   3617 ///    instruction.
   3618 ///
   3619 /// \param __q1
   3620 ///    A 64-bit integer value used to initialize the upper 64 bits of the
   3621 ///    destination vector of [2 x i64].
   3622 /// \param __q0
   3623 ///    A 64-bit integer value used to initialize the lower 64 bits of the
   3624 ///    destination vector of [2 x i64].
   3625 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
   3626 ///    provided in the operands.
   3627 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3628 _mm_set_epi64(__m64 __q1, __m64 __q0)
   3629 {
   3630   return (__m128i){ (long long)__q0, (long long)__q1 }; /* each __m64 is cast to long long; __q0 -> element 0, __q1 -> element 1 */
   3631 }
   3632 
   3633 /// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
   3634 ///    the specified 32-bit integer values.
   3635 ///
   3636 /// \headerfile <x86intrin.h>
   3637 ///
   3638 /// This intrinsic is a utility function and does not correspond to a specific
   3639 ///    instruction.
   3640 ///
   3641 /// \param __i3
   3642 ///    A 32-bit integer value used to initialize bits [127:96] of the
   3643 ///    destination vector.
   3644 /// \param __i2
   3645 ///    A 32-bit integer value used to initialize bits [95:64] of the destination
   3646 ///    vector.
   3647 /// \param __i1
   3648 ///    A 32-bit integer value used to initialize bits [63:32] of the destination
   3649 ///    vector.
   3650 /// \param __i0
   3651 ///    A 32-bit integer value used to initialize bits [31:0] of the destination
   3652 ///    vector.
   3653 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
   3654 ///    provided in the operands.
   3655 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3656 _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
   3657 {
   3658   return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; /* parameters arrive as __i3..__i0; the initializer lists element 0 (__i0) first */
   3659 }
   3660 
   3661 /// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
   3662 ///    the specified 16-bit integer values.
   3663 ///
   3664 /// \headerfile <x86intrin.h>
   3665 ///
   3666 /// This intrinsic is a utility function and does not correspond to a specific
   3667 ///    instruction.
   3668 ///
   3669 /// \param __w7
   3670 ///    A 16-bit integer value used to initialize bits [127:112] of the
   3671 ///    destination vector.
   3672 /// \param __w6
   3673 ///    A 16-bit integer value used to initialize bits [111:96] of the
   3674 ///    destination vector.
   3675 /// \param __w5
   3676 ///    A 16-bit integer value used to initialize bits [95:80] of the destination
   3677 ///    vector.
   3678 /// \param __w4
   3679 ///    A 16-bit integer value used to initialize bits [79:64] of the destination
   3680 ///    vector.
   3681 /// \param __w3
   3682 ///    A 16-bit integer value used to initialize bits [63:48] of the destination
   3683 ///    vector.
   3684 /// \param __w2
   3685 ///    A 16-bit integer value used to initialize bits [47:32] of the destination
   3686 ///    vector.
   3687 /// \param __w1
   3688 ///    A 16-bit integer value used to initialize bits [31:16] of the destination
   3689 ///    vector.
   3690 /// \param __w0
   3691 ///    A 16-bit integer value used to initialize bits [15:0] of the destination
   3692 ///    vector.
   3693 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
   3694 ///    provided in the operands.
   3695 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3696 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
   3697 { /* parameters arrive as __w7..__w0; the initializer below lists element 0 (__w0) first */
   3698   return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
   3699 }
   3700 
   3701 /// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
   3702 ///    the specified 8-bit integer values.
   3703 ///
   3704 /// \headerfile <x86intrin.h>
   3705 ///
   3706 /// This intrinsic is a utility function and does not correspond to a specific
   3707 ///    instruction.
   3708 ///
   3709 /// \param __b15
   3710 ///    Initializes bits [127:120] of the destination vector.
   3711 /// \param __b14
   3712 ///    Initializes bits [119:112] of the destination vector.
   3713 /// \param __b13
   3714 ///    Initializes bits [111:104] of the destination vector.
   3715 /// \param __b12
   3716 ///    Initializes bits [103:96] of the destination vector.
   3717 /// \param __b11
   3718 ///    Initializes bits [95:88] of the destination vector.
   3719 /// \param __b10
   3720 ///    Initializes bits [87:80] of the destination vector.
   3721 /// \param __b9
   3722 ///    Initializes bits [79:72] of the destination vector.
   3723 /// \param __b8
   3724 ///    Initializes bits [71:64] of the destination vector.
   3725 /// \param __b7
   3726 ///    Initializes bits [63:56] of the destination vector.
   3727 /// \param __b6
   3728 ///    Initializes bits [55:48] of the destination vector.
   3729 /// \param __b5
   3730 ///    Initializes bits [47:40] of the destination vector.
   3731 /// \param __b4
   3732 ///    Initializes bits [39:32] of the destination vector.
   3733 /// \param __b3
   3734 ///    Initializes bits [31:24] of the destination vector.
   3735 /// \param __b2
   3736 ///    Initializes bits [23:16] of the destination vector.
   3737 /// \param __b1
   3738 ///    Initializes bits [15:8] of the destination vector.
   3739 /// \param __b0
   3740 ///    Initializes bits [7:0] of the destination vector.
   3741 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
   3742 ///    provided in the operands.
   3743 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3744 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
   3745 { /* parameters arrive as __b15..__b0; the initializer below lists element 0 (__b0) first */
   3746   return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
   3747 }
   3748 
   3749 /// \brief Initializes both values in a 128-bit integer vector with the
   3750 ///    specified 64-bit integer value.
   3751 ///
   3752 /// \headerfile <x86intrin.h>
   3753 ///
   3754 /// This intrinsic is a utility function and does not correspond to a specific
   3755 ///    instruction.
   3756 ///
   3757 /// \param __q
   3758 ///    Integer value used to initialize the elements of the destination integer
   3759 ///    vector.
   3760 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
   3761 ///    elements containing the value provided in the operand.
   3762 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3763 _mm_set1_epi64x(long long __q)
   3764 {
   3765   return (__m128i){ __q, __q }; /* the same value in both elements */
   3766 }
   3767 
   3768 /// \brief Initializes both values in a 128-bit vector of [2 x i64] with the
   3769 ///    specified 64-bit value.
   3770 ///
   3771 /// \headerfile <x86intrin.h>
   3772 ///
   3773 /// This intrinsic is a utility function and does not correspond to a specific
   3774 ///    instruction.
   3775 ///
   3776 /// \param __q
   3777 ///    A 64-bit value used to initialize the elements of the destination integer
   3778 ///    vector.
   3779 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
   3780 ///    containing the value provided in the operand.
   3781 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3782 _mm_set1_epi64(__m64 __q)
   3783 {
   3784   return (__m128i){ (long long)__q, (long long)__q }; /* __q is cast to long long and placed in both elements */
   3785 }
   3786 
   3787 /// \brief Initializes all values in a 128-bit vector of [4 x i32] with the
   3788 ///    specified 32-bit value.
   3789 ///
   3790 /// \headerfile <x86intrin.h>
   3791 ///
   3792 /// This intrinsic is a utility function and does not correspond to a specific
   3793 ///    instruction.
   3794 ///
   3795 /// \param __i
   3796 ///    A 32-bit value used to initialize the elements of the destination integer
   3797 ///    vector.
   3798 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
   3799 ///    containing the value provided in the operand.
   3800 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3801 _mm_set1_epi32(int __i)
   3802 {
   3803   return (__m128i)(__v4si){ __i, __i, __i, __i }; /* __i copied into all four elements */
   3804 }
   3805 
   3806 /// \brief Initializes all values in a 128-bit vector of [8 x i16] with the
   3807 ///    specified 16-bit value.
   3808 ///
   3809 /// \headerfile <x86intrin.h>
   3810 ///
   3811 /// This intrinsic is a utility function and does not correspond to a specific
   3812 ///    instruction.
   3813 ///
   3814 /// \param __w
   3815 ///    A 16-bit value used to initialize the elements of the destination integer
   3816 ///    vector.
   3817 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
   3818 ///    containing the value provided in the operand.
   3819 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3820 _mm_set1_epi16(short __w)
   3821 {
   3822   return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w }; /* __w copied into all eight elements */
   3823 }
   3824 
   3825 /// \brief Initializes all values in a 128-bit vector of [16 x i8] with the
   3826 ///    specified 8-bit value.
   3827 ///
   3828 /// \headerfile <x86intrin.h>
   3829 ///
   3830 /// This intrinsic is a utility function and does not correspond to a specific
   3831 ///    instruction.
   3832 ///
   3833 /// \param __b
   3834 ///    An 8-bit value used to initialize the elements of the destination integer
   3835 ///    vector.
   3836 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
   3837 ///    containing the value provided in the operand.
   3838 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3839 _mm_set1_epi8(char __b)
   3840 { /* __b is copied into all sixteen elements of the initializer below */
   3841   return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
   3842 }
   3843 
   3844 /// \brief Constructs a 128-bit integer vector, initialized in reverse order
   3845 ///     with the specified 64-bit integral values.
   3846 ///
   3847 /// \headerfile <x86intrin.h>
   3848 ///
   3849 /// This intrinsic is a utility function and does not correspond to a specific
   3850 ///    instruction.
   3851 ///
   3852 /// \param __q0
   3853 ///    A 64-bit integral value used to initialize the lower 64 bits of the
   3854 ///    result.
   3855 /// \param __q1
   3856 ///    A 64-bit integral value used to initialize the upper 64 bits of the
   3857 ///    result.
   3858 /// \returns An initialized 128-bit integer vector.
   3859 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3860 _mm_setr_epi64(__m64 __q0, __m64 __q1)
   3861 {
   3862   return (__m128i){ (long long)__q0, (long long)__q1 }; /* arguments are already in element order: __q0 -> element 0, __q1 -> element 1 */
   3863 }
   3864 
   3865 /// \brief Constructs a 128-bit integer vector, initialized in reverse order
   3866 ///     with the specified 32-bit integral values.
   3867 ///
   3868 /// \headerfile <x86intrin.h>
   3869 ///
   3870 /// This intrinsic is a utility function and does not correspond to a specific
   3871 ///    instruction.
   3872 ///
   3873 /// \param __i0
   3874 ///    A 32-bit integral value used to initialize bits [31:0] of the result.
   3875 /// \param __i1
   3876 ///    A 32-bit integral value used to initialize bits [63:32] of the result.
   3877 /// \param __i2
   3878 ///    A 32-bit integral value used to initialize bits [95:64] of the result.
   3879 /// \param __i3
   3880 ///    A 32-bit integral value used to initialize bits [127:96] of the result.
   3881 /// \returns An initialized 128-bit integer vector.
   3882 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3883 _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
   3884 {
   3885   return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; /* arguments are passed straight through in element order (element 0 first) */
   3886 }
   3887 
   3888 /// \brief Constructs a 128-bit integer vector, initialized in reverse order
   3889 ///     with the specified 16-bit integral values.
   3890 ///
   3891 /// \headerfile <x86intrin.h>
   3892 ///
   3893 /// This intrinsic is a utility function and does not correspond to a specific
   3894 ///    instruction.
   3895 ///
   3896 /// \param __w0
   3897 ///    A 16-bit integral value used to initialize bits [15:0] of the result.
   3898 /// \param __w1
   3899 ///    A 16-bit integral value used to initialize bits [31:16] of the result.
   3900 /// \param __w2
   3901 ///    A 16-bit integral value used to initialize bits [47:32] of the result.
   3902 /// \param __w3
   3903 ///    A 16-bit integral value used to initialize bits [63:48] of the result.
   3904 /// \param __w4
   3905 ///    A 16-bit integral value used to initialize bits [79:64] of the result.
   3906 /// \param __w5
   3907 ///    A 16-bit integral value used to initialize bits [95:80] of the result.
   3908 /// \param __w6
   3909 ///    A 16-bit integral value used to initialize bits [111:96] of the result.
   3910 /// \param __w7
   3911 ///    A 16-bit integral value used to initialize bits [127:112] of the result.
   3912 /// \returns An initialized 128-bit integer vector.
   3913 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3914 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
   3915 { /* arguments are passed straight through in element order (__w0 is element 0) */
   3916   return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
   3917 }
   3918 
   3919 /// \brief Constructs a 128-bit integer vector, initialized in reverse order
   3920 ///     with the specified 8-bit integral values.
   3921 ///
   3922 /// \headerfile <x86intrin.h>
   3923 ///
   3924 /// This intrinsic is a utility function and does not correspond to a specific
   3925 ///    instruction.
   3926 ///
   3927 /// \param __b0
   3928 ///    An 8-bit integral value used to initialize bits [7:0] of the result.
   3929 /// \param __b1
   3930 ///    An 8-bit integral value used to initialize bits [15:8] of the result.
   3931 /// \param __b2
   3932 ///    An 8-bit integral value used to initialize bits [23:16] of the result.
   3933 /// \param __b3
   3934 ///    An 8-bit integral value used to initialize bits [31:24] of the result.
   3935 /// \param __b4
   3936 ///    An 8-bit integral value used to initialize bits [39:32] of the result.
   3937 /// \param __b5
   3938 ///    An 8-bit integral value used to initialize bits [47:40] of the result.
   3939 /// \param __b6
   3940 ///    An 8-bit integral value used to initialize bits [55:48] of the result.
   3941 /// \param __b7
   3942 ///    An 8-bit integral value used to initialize bits [63:56] of the result.
   3943 /// \param __b8
   3944 ///    An 8-bit integral value used to initialize bits [71:64] of the result.
   3945 /// \param __b9
   3946 ///    An 8-bit integral value used to initialize bits [79:72] of the result.
   3947 /// \param __b10
   3948 ///    An 8-bit integral value used to initialize bits [87:80] of the result.
   3949 /// \param __b11
   3950 ///    An 8-bit integral value used to initialize bits [95:88] of the result.
   3951 /// \param __b12
   3952 ///    An 8-bit integral value used to initialize bits [103:96] of the result.
   3953 /// \param __b13
   3954 ///    An 8-bit integral value used to initialize bits [111:104] of the result.
   3955 /// \param __b14
   3956 ///    An 8-bit integral value used to initialize bits [119:112] of the result.
   3957 /// \param __b15
   3958 ///    An 8-bit integral value used to initialize bits [127:120] of the result.
   3959 /// \returns An initialized 128-bit integer vector.
   3960 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3961 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
   3962 { /* arguments are passed straight through in element order (__b0 is element 0) */
   3963   return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
   3964 }
   3965 
   3966 /// \brief Creates a 128-bit integer vector initialized to zero.
   3967 ///
   3968 /// \headerfile <x86intrin.h>
   3969 ///
   3970 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
   3971 ///
   3972 /// \returns An initialized 128-bit integer vector with all elements set to
   3973 ///    zero.
   3974 static __inline__ __m128i __DEFAULT_FN_ATTRS
   3975 _mm_setzero_si128(void)
   3976 {
   3977   return (__m128i){ 0LL, 0LL }; /* both long long elements are zero, so all 128 bits are clear */
   3978 }
   3979 
   3980 /// \brief Stores a 128-bit integer vector to a memory location aligned on a
   3981 ///    128-bit boundary.
   3982 ///
   3983 /// \headerfile <x86intrin.h>
   3984 ///
   3985 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
   3986 ///
   3987 /// \param __p
   3988 ///    A pointer to an aligned memory location that will receive the integer
   3989 ///    values.
   3990 /// \param __b
   3991 ///    A 128-bit integer vector containing the values to be moved.
   3992 static __inline__ void __DEFAULT_FN_ATTRS
   3993 _mm_store_si128(__m128i *__p, __m128i __b)
   3994 {
   3995   *__p = __b; /* direct assignment of the whole __m128i through __p; no packed wrapper type is used */
   3996 }
   3997 
   3998 /// \brief Stores a 128-bit integer vector to an unaligned memory location.
   3999 ///
   4000 /// \headerfile <x86intrin.h>
   4001 ///
   4002 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
   4003 ///
   4004 /// \param __p
   4005 ///    A pointer to a memory location that will receive the integer values.
   4006 /// \param __b
   4007 ///    A 128-bit integer vector containing the values to be moved.
   4008 static __inline__ void __DEFAULT_FN_ATTRS
   4009 _mm_storeu_si128(__m128i *__p, __m128i __b)
   4010 {
   4011   struct __storeu_si128 { /* wrapper type: __packed__ + __may_alias__ around one __m128i */
   4012     __m128i __v;
   4013   } __attribute__((__packed__, __may_alias__));
   4014   ((struct __storeu_si128*)__p)->__v = __b; /* all of __b is written through the wrapper type */
   4015 }
   4016 
   4017 /// \brief Moves bytes selected by the mask from the first operand to the
   4018 ///    specified unaligned memory location. When a mask bit is 1, the
   4019 ///    corresponding byte is written, otherwise it is not written.
   4020 ///
   4021 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
   4022 ///    used again soon). Exception and trap behavior for elements not selected
   4023 ///    for storage to memory are implementation dependent.
   4024 ///
   4025 /// \headerfile <x86intrin.h>
   4026 ///
   4027 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
   4028 ///   instruction.
   4029 ///
   4030 /// \param __d
   4031 ///    A 128-bit integer vector containing the values to be moved.
   4032 /// \param __n
   4033 ///    A 128-bit integer vector containing the mask. The most significant bit of
   4034 ///    each byte represents the mask bits.
   4035 /// \param __p
   4036 ///    A pointer to an unaligned 128-bit memory location where the specified
   4037 ///    values are moved.
   4038 static __inline__ void __DEFAULT_FN_ATTRS
   4039 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
   4040 {
   4041   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); /* data and mask are both reinterpreted as 16 x char; __p is passed through unchanged */
   4042 }
   4043 
   4044 /// \brief Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
   4045 ///    a memory location.
   4046 ///
   4047 /// \headerfile <x86intrin.h>
   4048 ///
   4049 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
   4050 ///
   4051 /// \param __p
   4052 ///    A pointer to a 64-bit memory location that will receive the lower 64 bits
   4053 ///    of the integer vector parameter.
   4054 /// \param __a
   4055 ///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
   4056 ///    value to be stored.
   4057 static __inline__ void __DEFAULT_FN_ATTRS
   4058 _mm_storel_epi64(__m128i *__p, __m128i __a)
   4059 {
   4060   struct __mm_storel_epi64_struct { /* wrapper type: __packed__ + __may_alias__ around one long long */
   4061     long long __u;
   4062   } __attribute__((__packed__, __may_alias__));
   4063   ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; /* only element 0 of __a (one long long) is written through __p */
   4064 }
   4065 
   4066 /// \brief Stores a 128-bit floating point vector of [2 x double] to a 128-bit
   4067 ///    aligned memory location.
   4068 ///
   4069 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
   4070 ///    used again soon).
   4071 ///
   4072 /// \headerfile <x86intrin.h>
   4073 ///
   4074 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
   4075 ///
   4076 /// \param __p
   4077 ///    A pointer to the 128-bit aligned memory location used to store the value.
   4078 /// \param __a
   4079 ///    A vector of [2 x double] containing the 64-bit values to be stored.
   4080 static __inline__ void __DEFAULT_FN_ATTRS
   4081 _mm_stream_pd(double *__p, __m128d __a)
   4082 {
   4083   __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p); /* value first, destination second; __p is accessed as one 2 x double vector */
   4084 }
   4085 
   4086 /// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location.
   4087 ///
   4088 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
   4089 ///    used again soon).
   4090 ///
   4091 /// \headerfile <x86intrin.h>
   4092 ///
   4093 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
   4094 ///
   4095 /// \param __p
   4096 ///    A pointer to the 128-bit aligned memory location used to store the value.
   4097 /// \param __a
   4098 ///    A 128-bit integer vector containing the values to be stored.
   4099 static __inline__ void __DEFAULT_FN_ATTRS
   4100 _mm_stream_si128(__m128i *__p, __m128i __a)
   4101 {
   4102   __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p); /* value first, destination second; both are viewed as 2 x long long vectors */
   4103 }
   4104 
   4105 /// \brief Stores a 32-bit integer value in the specified memory location.
   4106 ///
   4107 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
   4108 ///    used again soon).
   4109 ///
   4110 /// \headerfile <x86intrin.h>
   4111 ///
   4112 /// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
   4113 ///
   4114 /// \param __p
   4115 ///    A pointer to the 32-bit memory location used to store the value.
   4116 /// \param __a
   4117 ///    A 32-bit integer containing the value to be stored.
   4118 static __inline__ void __DEFAULT_FN_ATTRS
   4119 _mm_stream_si32(int *__p, int __a)
   4120 {
   4121   __builtin_ia32_movnti(__p, __a); /* destination pointer first, then the value; both are forwarded unchanged */
   4122 }
   4123 
   4124 #ifdef __x86_64__
   4125 /// \brief Stores a 64-bit integer value in the specified memory location.
   4126 ///
   4127 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
   4128 ///    used again soon).
   4129 ///
   4130 /// \headerfile <x86intrin.h>
   4131 ///
   4132 /// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
   4133 ///
   4134 /// \param __p
   4135 ///    A pointer to the 64-bit memory location used to store the value.
   4136 /// \param __a
   4137 ///    A 64-bit integer containing the value to be stored.
   4138 static __inline__ void __DEFAULT_FN_ATTRS
   4139 _mm_stream_si64(long long *__p, long long __a)
   4140 {
   4141   __builtin_ia32_movnti64(__p, __a); /* destination pointer first, then the value; both are forwarded unchanged */
   4142 }
   4143 #endif
   4144 
   4145 #if defined(__cplusplus) /* C++ only: open a C-linkage region for the three prototypes below */
   4146 extern "C" {
   4147 #endif
   4148 
   4149 /// \brief The cache line containing \a __p is flushed and invalidated from all
   4150 ///    caches in the coherency domain.
   4151 ///
   4152 /// \headerfile <x86intrin.h>
   4153 ///
   4154 /// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
   4155 ///
   4156 /// \param __p
   4157 ///    A pointer to the memory location used to identify the cache line to be
   4158 ///    flushed.
   4159 void _mm_clflush(void const * __p); /* prototype only: no inline body is given here */
   4160 
   4161 /// \brief Forces strong memory ordering (serialization) between load
   4162 ///    instructions preceding this instruction and load instructions following
   4163 ///    this instruction, ensuring the system completes all previous loads before
   4164 ///    executing subsequent loads.
   4165 ///
   4166 /// \headerfile <x86intrin.h>
   4167 ///
   4168 /// This intrinsic corresponds to the <c> LFENCE </c> instruction.
   4169 ///
   4170 void _mm_lfence(void); /* prototype only: takes no arguments and returns nothing */
   4171 
   4172 /// \brief Forces strong memory ordering (serialization) between load and store
   4173 ///    instructions preceding this instruction and load and store instructions
   4174 ///    following this instruction, ensuring that the system completes all
   4175 ///    previous memory accesses before executing subsequent memory accesses.
   4176 ///
   4177 /// \headerfile <x86intrin.h>
   4178 ///
   4179 /// This intrinsic corresponds to the <c> MFENCE </c> instruction.
   4180 ///
   4181 void _mm_mfence(void); /* prototype only: takes no arguments and returns nothing */
   4182 
   4183 #if defined(__cplusplus) /* C++ only: close the C-linkage region */
   4184 } // extern "C"
   4185 #endif
   4186 
   4187 /// \brief Converts 16-bit signed integers from both 128-bit integer vector
   4188 ///    operands into 8-bit signed integers, and packs the results into the
   4189 ///    destination. Positive values greater than 0x7F are saturated to 0x7F.
   4190 ///    Negative values less than 0x80 are saturated to 0x80.
   4191 ///
   4192 /// \headerfile <x86intrin.h>
   4193 ///
   4194 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
   4195 ///
   4196 /// \param __a
   4197 ///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
   4198 ///   a signed integer and is converted to a 8-bit signed integer with
   4199 ///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
   4200 ///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
   4201 ///   written to the lower 64 bits of the result.
   4202 /// \param __b
   4203 ///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
   4204 ///   a signed integer and is converted to a 8-bit signed integer with
   4205 ///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
   4206 ///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
   4207 ///   written to the higher 64 bits of the result.
   4208 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
   4209 static __inline__ __m128i __DEFAULT_FN_ATTRS
   4210 _mm_packs_epi16(__m128i __a, __m128i __b)
   4211 {
   4212   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); /* both inputs are viewed as 8 x short; the builtin's result is cast back to __m128i */
   4213 }
   4214 
   4215 /// \brief Converts 32-bit signed integers from both 128-bit integer vector
   4216 ///    operands into 16-bit signed integers, and packs the results into the
   4217 ///    destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
   4218 ///    Negative values less than 0x8000 are saturated to 0x8000.
   4219 ///
   4220 /// \headerfile <x86intrin.h>
   4221 ///
   4222 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
   4223 ///
   4224 /// \param __a
   4225 ///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
   4226 ///    a signed integer and is converted to a 16-bit signed integer with
   4227 ///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
   4228 ///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
   4229 ///    are written to the lower 64 bits of the result.
   4230 /// \param __b
   4231 ///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
   4232 ///    a signed integer and is converted to a 16-bit signed integer with
   4233 ///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
   4234 ///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
   4235 ///    are written to the higher 64 bits of the result.
   4236 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
   4237 static __inline__ __m128i __DEFAULT_FN_ATTRS
   4238 _mm_packs_epi32(__m128i __a, __m128i __b)
   4239 { /* Both operands are viewed as [4 x i32] (__v4si) for the PACKSSDW builtin. */
   4240   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
   4241 }
   4242 
   4243 /// \brief Converts 16-bit signed integers from both 128-bit integer vector
   4244 ///    operands into 8-bit unsigned integers, and packs the results into the
   4245 ///    destination. Values greater than 0xFF are saturated to 0xFF. Values less
   4246 ///    than 0x00 are saturated to 0x00.
   4247 ///
   4248 /// \headerfile <x86intrin.h>
   4249 ///
   4250 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
   4251 ///
   4252 /// \param __a
   4253 ///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
   4254 ///    a signed integer and is converted to an 8-bit unsigned integer with
   4255 ///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
   4256 ///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
   4257 ///    written to the lower 64 bits of the result.
   4258 /// \param __b
   4259 ///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
   4260 ///    a signed integer and is converted to an 8-bit unsigned integer with
   4261 ///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
   4262 ///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
   4263 ///    written to the higher 64 bits of the result.
   4264 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
   4265 static __inline__ __m128i __DEFAULT_FN_ATTRS
   4266 _mm_packus_epi16(__m128i __a, __m128i __b)
   4267 { /* Both operands are viewed as [8 x i16] (__v8hi) for the PACKUSWB builtin. */
   4268   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
   4269 }
   4270 
   4271 /// \brief Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
   4272 ///    the immediate-value parameter as a selector.
   4273 ///
   4274 /// \headerfile <x86intrin.h>
   4275 ///
   4276 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
   4277 ///
   4278 /// \param __a
   4279 ///    A 128-bit integer vector.
   4280 /// \param __imm
   4281 ///    An immediate value. Bits [2:0] select values from \a __a to be assigned
   4282 ///    to bits[15:0] of the result. \n
   4283 ///    000: assign values from bits [15:0] of \a __a. \n
   4284 ///    001: assign values from bits [31:16] of \a __a. \n
   4285 ///    010: assign values from bits [47:32] of \a __a. \n
   4286 ///    011: assign values from bits [63:48] of \a __a. \n
   4287 ///    100: assign values from bits [79:64] of \a __a. \n
   4288 ///    101: assign values from bits [95:80] of \a __a. \n
   4289 ///    110: assign values from bits [111:96] of \a __a. \n
   4290 ///    111: assign values from bits [127:112] of \a __a.
   4291 /// \returns An integer, whose lower 16 bits are selected from the 128-bit
   4292 ///    integer vector parameter and the remaining bits are assigned zeros.
   4293 static __inline__ int __DEFAULT_FN_ATTRS
   4294 _mm_extract_epi16(__m128i __a, int __imm)
   4295 {
   4296   __v8hi __b = (__v8hi)__a; /* view __a as eight 16-bit lanes */
   4297   return (unsigned short)__b[__imm & 7]; /* low 3 bits pick the lane; zero-extended */
   4298 }
   4299 
   4300 /// \brief Constructs a 128-bit integer vector by first making a copy of the
   4301 ///    128-bit integer vector parameter, and then inserting the lower 16 bits
   4302 ///    of an integer parameter into an offset specified by the immediate-value
   4303 ///    parameter.
   4304 ///
   4305 /// \headerfile <x86intrin.h>
   4306 ///
   4307 /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
   4308 ///
   4309 /// \param __a
   4310 ///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
   4311 ///    result and then one of the eight elements in the result is replaced by
   4312 ///    the lower 16 bits of \a __b.
   4313 /// \param __b
   4314 ///    An integer. The lower 16 bits of this parameter are written to the
   4315 ///    result beginning at an offset specified by \a __imm.
   4316 /// \param __imm
   4317 ///    An immediate value. Bits [2:0] select which of the eight 16-bit
   4318 ///    elements of the result is replaced by the lower 16 bits of \a __b.
   4319 /// \returns A 128-bit integer vector containing the constructed values.
   4320 static __inline__ __m128i __DEFAULT_FN_ATTRS
   4321 _mm_insert_epi16(__m128i __a, int __b, int __imm)
   4322 {
   4323   __v8hi __c = (__v8hi)__a; /* local copy of __a viewed as eight 16-bit lanes */
   4324   __c[__imm & 7] = __b; /* low 3 bits of __imm pick the lane to overwrite */
   4325   return (__m128i)__c;
   4326 }
   4327 
   4328 /// \brief Copies the values of the most significant bits from each 8-bit
   4329 ///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
   4330 ///    value, zero-extends the value, and writes it to the destination.
   4331 ///
   4332 /// \headerfile <x86intrin.h>
   4333 ///
   4334 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
   4335 ///
   4336 /// \param __a
   4337 ///    A 128-bit integer vector containing the values with bits to be extracted.
   4338 /// \returns The most significant bits from each 8-bit element in \a __a,
   4339 ///    written to bits [15:0]. The other bits are assigned zeros.
   4340 static __inline__ int __DEFAULT_FN_ATTRS
   4341 _mm_movemask_epi8(__m128i __a)
   4342 { /* The operand is viewed as [16 x i8] (__v16qi) for the PMOVMSKB builtin. */
   4343   return __builtin_ia32_pmovmskb128((__v16qi)__a);
   4344 }
   4345 
   4346 /// \brief Constructs a 128-bit integer vector by shuffling four 32-bit
   4347 ///    elements of a 128-bit integer vector parameter, using the immediate-value
   4348 ///    parameter as a specifier.
   4349 ///
   4350 /// \headerfile <x86intrin.h>
   4351 ///
   4352 /// \code
   4353 /// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
   4354 /// \endcode
   4355 ///
   4356 /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
   4357 ///
   4358 /// \param a
   4359 ///    A 128-bit integer vector containing the values to be copied.
   4360 /// \param imm
   4361 ///    An immediate value containing an 8-bit value specifying which elements to
   4362 ///    copy from \a a. The fields of the 128-bit result are assigned
   4363 ///    values as follows: \n
   4364 ///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
   4365 ///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
   4366 ///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
   4367 ///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
   4368 ///    Bit value assignments: \n
   4369 ///    00: assign values from bits [31:0] of \a a. \n
   4370 ///    01: assign values from bits [63:32] of \a a. \n
   4371 ///    10: assign values from bits [95:64] of \a a. \n
   4372 ///    11: assign values from bits [127:96] of \a a.
   4373 /// \returns A 128-bit integer vector containing the shuffled values.
   4374 #define _mm_shuffle_epi32(a, imm) __extension__ ({ \
   4375   (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
   4376                                    (__v4si)_mm_undefined_si128(), /* never selected: all indices < 4 */ \
   4377                                    ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
   4378                                    ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); })
   4379 
   4380 /// \brief Constructs a 128-bit integer vector by shuffling four lower 16-bit
   4381 ///    elements of a 128-bit integer vector of [8 x i16], using the immediate
   4382 ///    value parameter as a specifier.
   4383 ///
   4384 /// \headerfile <x86intrin.h>
   4385 ///
   4386 /// \code
   4387 /// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
   4388 /// \endcode
   4389 ///
   4390 /// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
   4391 ///
   4392 /// \param a
   4393 ///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
   4394 ///    [127:64] of the result.
   4395 /// \param imm
   4396 ///    An 8-bit immediate value specifying which elements to copy from \a a. \n
   4397 ///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
   4398 ///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
   4399 ///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
   4400 ///    Bits[7:6] are used to assign values to bits [63:48] of the result. \n
   4401 ///    Bit value assignments: \n
   4402 ///    00: assign values from bits [15:0] of \a a. \n
   4403 ///    01: assign values from bits [31:16] of \a a. \n
   4404 ///    10: assign values from bits [47:32] of \a a. \n
   4405 ///    11: assign values from bits [63:48] of \a a. \n
   4406 /// \returns A 128-bit integer vector containing the shuffled values.
   4407 #define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
   4408   (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
   4409                                    (__v8hi)_mm_undefined_si128(), /* never selected: all indices < 8 */ \
   4410                                    ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
   4411                                    ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \
   4412                                    4, 5, 6, 7); }) /* lanes 4-7 are copied unchanged */
   4413 
   4414 /// \brief Constructs a 128-bit integer vector by shuffling four upper 16-bit
   4415 ///    elements of a 128-bit integer vector of [8 x i16], using the immediate
   4416 ///    value parameter as a specifier.
   4417 ///
   4418 /// \headerfile <x86intrin.h>
   4419 ///
   4420 /// \code
   4421 /// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
   4422 /// \endcode
   4423 ///
   4424 /// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
   4425 ///
   4426 /// \param a
   4427 ///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
   4428 ///    [63:0] of the result.
   4429 /// \param imm
   4430 ///    An 8-bit immediate value specifying which elements to copy from \a a. \n
   4431 ///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
   4432 ///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
   4433 ///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
   4434 ///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
   4435 ///    Bit value assignments: \n
   4436 ///    00: assign values from bits [79:64] of \a a. \n
   4437 ///    01: assign values from bits [95:80] of \a a. \n
   4438 ///    10: assign values from bits [111:96] of \a a. \n
   4439 ///    11: assign values from bits [127:112] of \a a. \n
   4440 /// \returns A 128-bit integer vector containing the shuffled values.
   4441 #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
   4442   (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
   4443                                    (__v8hi)_mm_undefined_si128(), /* never selected: all indices < 8 */ \
   4444                                    0, 1, 2, 3, /* lanes 0-3 are copied unchanged */ \
   4445                                    4 + (((imm) >> 0) & 0x3), \
   4446                                    4 + (((imm) >> 2) & 0x3), \
   4447                                    4 + (((imm) >> 4) & 0x3), \
   4448                                    4 + (((imm) >> 6) & 0x3)); })
   4449 
   4450 /// \brief Unpacks the high-order (index 8-15) values from two 128-bit vectors
   4451 ///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
   4452 ///
   4453 /// \headerfile <x86intrin.h>
   4454 ///
   4455 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
   4456 ///   instruction.
   4457 ///
   4458 /// \param __a
   4459 ///    A 128-bit vector of [16 x i8]. \n
   4460 ///    Bits [71:64] are written to bits [7:0] of the result. \n
   4461 ///    Bits [79:72] are written to bits [23:16] of the result. \n
   4462 ///    Bits [87:80] are written to bits [39:32] of the result. \n
   4463 ///    Bits [95:88] are written to bits [55:48] of the result. \n
   4464 ///    Bits [103:96] are written to bits [71:64] of the result. \n
   4465 ///    Bits [111:104] are written to bits [87:80] of the result. \n
   4466 ///    Bits [119:112] are written to bits [103:96] of the result. \n
   4467 ///    Bits [127:120] are written to bits [119:112] of the result.
   4468 /// \param __b
   4469 ///    A 128-bit vector of [16 x i8]. \n
   4470 ///    Bits [71:64] are written to bits [15:8] of the result. \n
   4471 ///    Bits [79:72] are written to bits [31:24] of the result. \n
   4472 ///    Bits [87:80] are written to bits [47:40] of the result. \n
   4473 ///    Bits [95:88] are written to bits [63:56] of the result. \n
   4474 ///    Bits [103:96] are written to bits [79:72] of the result. \n
   4475 ///    Bits [111:104] are written to bits [95:88] of the result. \n
   4476 ///    Bits [119:112] are written to bits [111:104] of the result. \n
   4477 ///    Bits [127:120] are written to bits [127:120] of the result.
   4478 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
   4479 static __inline__ __m128i __DEFAULT_FN_ATTRS
   4480 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
   4481 { /* Indices 0-15 pick from __a, 16-31 from __b: a8,b8,a9,b9,...,a15,b15. */
   4482   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
   4483 }
   4484 
   4485 /// \brief Unpacks the high-order (index 4-7) values from two 128-bit vectors of
   4486 ///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
   4487 ///
   4488 /// \headerfile <x86intrin.h>
   4489 ///
   4490 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
   4491 ///   instruction.
   4492 ///
   4493 /// \param __a
   4494 ///    A 128-bit vector of [8 x i16]. \n
   4495 ///    Bits [79:64] are written to bits [15:0] of the result. \n
   4496 ///    Bits [95:80] are written to bits [47:32] of the result. \n
   4497 ///    Bits [111:96] are written to bits [79:64] of the result. \n
   4498 ///    Bits [127:112] are written to bits [111:96] of the result.
   4499 /// \param __b
   4500 ///    A 128-bit vector of [8 x i16]. \n
   4501 ///    Bits [79:64] are written to bits [31:16] of the result. \n
   4502 ///    Bits [95:80] are written to bits [63:48] of the result. \n
   4503 ///    Bits [111:96] are written to bits [95:80] of the result. \n
   4504 ///    Bits [127:112] are written to bits [127:112] of the result.
   4505 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
   4506 static __inline__ __m128i __DEFAULT_FN_ATTRS
   4507 _mm_unpackhi_epi16(__m128i __a, __m128i __b)
   4508 { /* Indices 0-7 pick from __a, 8-15 from __b: a4,b4,a5,b5,a6,b6,a7,b7. */
   4509   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
   4510 }
   4511 
   4512 /// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
   4513 ///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
   4514 ///
   4515 /// \headerfile <x86intrin.h>
   4516 ///
   4517 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
   4518 ///   instruction.
   4519 ///
   4520 /// \param __a
   4521 ///    A 128-bit vector of [4 x i32]. \n
   4522 ///    Bits [95:64] are written to bits [31:0] of the destination. \n
   4523 ///    Bits [127:96] are written to bits [95:64] of the destination.
   4524 /// \param __b
   4525 ///    A 128-bit vector of [4 x i32]. \n
   4526 ///    Bits [95:64] are written to bits [63:32] of the destination. \n
   4527 ///    Bits [127:96] are written to bits [127:96] of the destination.
   4528 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
   4529 static __inline__ __m128i __DEFAULT_FN_ATTRS
   4530 _mm_unpackhi_epi32(__m128i __a, __m128i __b)
   4531 { /* Indices 0-3 pick from __a, 4-7 from __b: a2,b2,a3,b3. */
   4532   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
   4533 }
   4534 
   4535 /// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
   4536 ///    of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
   4537 ///
   4538 /// \headerfile <x86intrin.h>
   4539 ///
   4540 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
   4541 ///   instruction.
   4542 ///
   4543 /// \param __a
   4544 ///    A 128-bit vector of [2 x i64]. \n
   4545 ///    Bits [127:64] are written to bits [63:0] of the destination.
   4546 /// \param __b
   4547 ///    A 128-bit vector of [2 x i64]. \n
   4548 ///    Bits [127:64] are written to bits [127:64] of the destination.
   4549 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
   4550 static __inline__ __m128i __DEFAULT_FN_ATTRS
   4551 _mm_unpackhi_epi64(__m128i __a, __m128i __b)
   4552 { /* Index 1 is the upper element of __a; index 2+1 is the upper element of __b. */
   4553   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
   4554 }
   4555 
   4556 /// \brief Unpacks the low-order (index 0-7) values from two 128-bit vectors of
   4557 ///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
   4558 ///
   4559 /// \headerfile <x86intrin.h>
   4560 ///
   4561 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
   4562 ///   instruction.
   4563 ///
   4564 /// \param __a
   4565 ///    A 128-bit vector of [16 x i8]. \n
   4566 ///    Bits [7:0] are written to bits [7:0] of the result. \n
   4567 ///    Bits [15:8] are written to bits [23:16] of the result. \n
   4568 ///    Bits [23:16] are written to bits [39:32] of the result. \n
   4569 ///    Bits [31:24] are written to bits [55:48] of the result. \n
   4570 ///    Bits [39:32] are written to bits [71:64] of the result. \n
   4571 ///    Bits [47:40] are written to bits [87:80] of the result. \n
   4572 ///    Bits [55:48] are written to bits [103:96] of the result. \n
   4573 ///    Bits [63:56] are written to bits [119:112] of the result.
   4574 /// \param __b
   4575 ///    A 128-bit vector of [16 x i8]. \n
   4576 ///    Bits [7:0] are written to bits [15:8] of the result. \n
   4577 ///    Bits [15:8] are written to bits [31:24] of the result. \n
   4578 ///    Bits [23:16] are written to bits [47:40] of the result. \n
   4579 ///    Bits [31:24] are written to bits [63:56] of the result. \n
   4580 ///    Bits [39:32] are written to bits [79:72] of the result. \n
   4581 ///    Bits [47:40] are written to bits [95:88] of the result. \n
   4582 ///    Bits [55:48] are written to bits [111:104] of the result. \n
   4583 ///    Bits [63:56] are written to bits [127:120] of the result.
   4584 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
   4585 static __inline__ __m128i __DEFAULT_FN_ATTRS
   4586 _mm_unpacklo_epi8(__m128i __a, __m128i __b)
   4587 { /* Indices 0-15 pick from __a, 16-31 from __b: a0,b0,a1,b1,...,a7,b7. */
   4588   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
   4589 }
   4590 
   4591 /// \brief Unpacks the low-order (index 0-3) values from each of the two 128-bit
   4592 ///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
   4593 ///    [8 x i16].
   4594 ///
   4595 /// \headerfile <x86intrin.h>
   4596 ///
   4597 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
   4598 ///   instruction.
   4599 ///
   4600 /// \param __a
   4601 ///    A 128-bit vector of [8 x i16]. \n
   4602 ///    Bits [15:0] are written to bits [15:0] of the result. \n
   4603 ///    Bits [31:16] are written to bits [47:32] of the result. \n
   4604 ///    Bits [47:32] are written to bits [79:64] of the result. \n
   4605 ///    Bits [63:48] are written to bits [111:96] of the result.
   4606 /// \param __b
   4607 ///    A 128-bit vector of [8 x i16]. \n
   4608 ///    Bits [15:0] are written to bits [31:16] of the result. \n
   4609 ///    Bits [31:16] are written to bits [63:48] of the result. \n
   4610 ///    Bits [47:32] are written to bits [95:80] of the result. \n
   4611 ///    Bits [63:48] are written to bits [127:112] of the result.
   4612 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
   4613 static __inline__ __m128i __DEFAULT_FN_ATTRS
   4614 _mm_unpacklo_epi16(__m128i __a, __m128i __b)
   4615 { /* Indices 0-7 pick from __a, 8-15 from __b: a0,b0,a1,b1,a2,b2,a3,b3. */
   4616   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
   4617 }
   4618 
   4619 /// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
   4620 ///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
   4621 ///
   4622 /// \headerfile <x86intrin.h>
   4623 ///
   4624 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
   4625 ///   instruction.
   4626 ///
   4627 /// \param __a
   4628 ///    A 128-bit vector of [4 x i32]. \n
   4629 ///    Bits [31:0] are written to bits [31:0] of the destination. \n
   4630 ///    Bits [63:32] are written to bits [95:64] of the destination.
   4631 /// \param __b
   4632 ///    A 128-bit vector of [4 x i32]. \n
   4633 ///    Bits [31:0] are written to bits [63:32] of the destination. \n
   4634 ///    Bits [63:32] are written to bits [127:96] of the destination.
   4635 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
   4636 static __inline__ __m128i __DEFAULT_FN_ATTRS
   4637 _mm_unpacklo_epi32(__m128i __a, __m128i __b)
   4638 { /* Indices 0-3 pick from __a, 4-7 from __b: a0,b0,a1,b1. */
   4639   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
   4640 }
   4641 
   4642 /// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors of
   4643 ///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
   4644 ///
   4645 /// \headerfile <x86intrin.h>
   4646 ///
   4647 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
   4648 ///   instruction.
   4649 ///
   4650 /// \param __a
   4651 ///    A 128-bit vector of [2 x i64]. \n
   4652 ///    Bits [63:0] are written to bits [63:0] of the destination. \n
   4653 /// \param __b
   4654 ///    A 128-bit vector of [2 x i64]. \n
   4655 ///    Bits [63:0] are written to bits [127:64] of the destination. \n
   4656 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
   4657 static __inline__ __m128i __DEFAULT_FN_ATTRS
   4658 _mm_unpacklo_epi64(__m128i __a, __m128i __b)
   4659 { /* Index 0 is the lower element of __a; index 2+0 is the lower element of __b. */
   4660   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
   4661 }
   4662 
   4663 /// \brief Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
   4664 ///    integer.
   4665 ///
   4666 /// \headerfile <x86intrin.h>
   4667 ///
   4668 /// This intrinsic has no corresponding instruction.
   4669 ///
   4670 /// \param __a
   4671 ///    A 128-bit integer vector operand. The lower 64 bits are moved to the
   4672 ///    destination.
   4673 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
   4674 static __inline__ __m64 __DEFAULT_FN_ATTRS
   4675 _mm_movepi64_pi64(__m128i __a)
   4676 { /* __m128i is a two-element long long vector; only element 0 is returned. */
   4677   return (__m64)__a[0];
   4678 }
   4679 
   4680 /// \brief Moves the 64-bit operand to a 128-bit integer vector, zeroing the
   4681 ///    upper bits.
   4682 ///
   4683 /// \headerfile <x86intrin.h>
   4684 ///
   4685 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ / MOVD </c> instruction.
   4686 ///
   4687 /// \param __a
   4688 ///    A 64-bit value.
   4689 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
   4690 ///    the operand. The upper 64 bits are assigned zeros.
   4691 static __inline__ __m128i __DEFAULT_FN_ATTRS
   4692 _mm_movpi64_epi64(__m64 __a)
   4693 { /* Element 0 receives __a; element 1 is set to zero explicitly. */
   4694   return (__m128i){ (long long)__a, 0 };
   4695 }
   4696 
   4697 /// \brief Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
   4698 ///    integer vector, zeroing the upper bits.
   4699 ///
   4700 /// \headerfile <x86intrin.h>
   4701 ///
   4702 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
   4703 ///
   4704 /// \param __a
   4705 ///    A 128-bit integer vector operand. The lower 64 bits are moved to the
   4706 ///    destination.
   4707 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
   4708 ///    the operand. The upper 64 bits are assigned zeros.
   4709 static __inline__ __m128i __DEFAULT_FN_ATTRS
   4710 _mm_move_epi64(__m128i __a)
   4711 { /* Index 0 is the lower element of __a; index 2 is element 0 of the zero vector. */
   4712   return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
   4713 }
   4714 
   4715 /// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
   4716 ///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
   4717 ///    double].
   4718 ///
   4719 /// \headerfile <x86intrin.h>
   4720 ///
   4721 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
   4722 ///
   4723 /// \param __a
   4724 ///    A 128-bit vector of [2 x double]. \n
   4725 ///    Bits [127:64] are written to bits [63:0] of the destination.
   4726 /// \param __b
   4727 ///    A 128-bit vector of [2 x double]. \n
   4728 ///    Bits [127:64] are written to bits [127:64] of the destination.
   4729 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
   4730 static __inline__ __m128d __DEFAULT_FN_ATTRS
   4731 _mm_unpackhi_pd(__m128d __a, __m128d __b)
   4732 { /* Index 1 is the upper double of __a; index 2+1 is the upper double of __b. */
   4733   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
   4734 }
   4735 
   4736 /// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors
   4737 ///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
   4738 ///    double].
   4739 ///
   4740 /// \headerfile <x86intrin.h>
   4741 ///
   4742 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
   4743 ///
   4744 /// \param __a
   4745 ///    A 128-bit vector of [2 x double]. \n
   4746 ///    Bits [63:0] are written to bits [63:0] of the destination.
   4747 /// \param __b
   4748 ///    A 128-bit vector of [2 x double]. \n
   4749 ///    Bits [63:0] are written to bits [127:64] of the destination.
   4750 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
   4751 static __inline__ __m128d __DEFAULT_FN_ATTRS
   4752 _mm_unpacklo_pd(__m128d __a, __m128d __b)
   4753 { /* Index 0 is the lower double of __a; index 2+0 is the lower double of __b. */
   4754   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
   4755 }
   4756 
   4757 /// \brief Extracts the sign bits of the double-precision values in the 128-bit
   4758 ///    vector of [2 x double], zero-extends the value, and writes it to the
   4759 ///    low-order bits of the destination.
   4760 ///
   4761 /// \headerfile <x86intrin.h>
   4762 ///
   4763 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
   4764 ///
   4765 /// \param __a
   4766 ///    A 128-bit vector of [2 x double] containing the values with sign bits to
   4767 ///    be extracted.
   4768 /// \returns The sign bits from each of the double-precision elements in \a __a,
   4769 ///    written to bits [1:0]. The remaining bits are assigned values of zero.
   4770 static __inline__ int __DEFAULT_FN_ATTRS
   4771 _mm_movemask_pd(__m128d __a)
   4772 { /* __m128d and __v2df are both [2 x double]; the cast only names the builtin's type. */
   4773   return __builtin_ia32_movmskpd((__v2df)__a);
   4774 }
   4775 
   4776 
   4777 /// \brief Constructs a 128-bit floating-point vector of [2 x double] from two
   4778 ///    128-bit vector parameters of [2 x double], using the immediate-value
   4779 ///    parameter as a specifier.
   4780 ///
   4781 /// \headerfile <x86intrin.h>
   4782 ///
   4783 /// \code
   4784 /// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
   4785 /// \endcode
   4786 ///
   4787 /// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
   4788 ///
   4789 /// \param a
   4790 ///    A 128-bit vector of [2 x double].
   4791 /// \param b
   4792 ///    A 128-bit vector of [2 x double].
   4793 /// \param i
   4794 ///    An 8-bit immediate value. The least significant two bits specify which
   4795 ///    elements to copy from \a a and \a b: \n
   4796 ///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
   4797 ///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
   4798 ///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
   4799 ///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
   4800 /// \returns A 128-bit vector of [2 x double] containing the shuffled values.
   4801 #define _mm_shuffle_pd(a, b, i) __extension__ ({ \
   4802   (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
   4803                                    0 + (((i) >> 0) & 0x1), /* index 0 or 1: an element of a */ \
   4804                                    2 + (((i) >> 1) & 0x1)); }) /* index 2 or 3: an element of b */
   4805 
   4806 /// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
   4807 ///    floating-point vector of [4 x float].
   4808 ///
   4809 /// \headerfile <x86intrin.h>
   4810 ///
   4811 /// This intrinsic has no corresponding instruction.
   4812 ///
   4813 /// \param __a
   4814 ///    A 128-bit floating-point vector of [2 x double].
   4815 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
   4816 ///    bitwise pattern as the parameter.
   4817 static __inline__ __m128 __DEFAULT_FN_ATTRS
   4818 _mm_castpd_ps(__m128d __a)
   4819 { /* Vector-to-vector cast: the 128 bits are reinterpreted, not converted. */
   4820   return (__m128)__a;
   4821 }
   4822 
   4823 /// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
   4824 ///    integer vector.
   4825 ///
   4826 /// \headerfile <x86intrin.h>
   4827 ///
   4828 /// This intrinsic has no corresponding instruction.
   4829 ///
   4830 /// \param __a
   4831 ///    A 128-bit floating-point vector of [2 x double].
   4832 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
   4833 ///    parameter.
   4834 static __inline__ __m128i __DEFAULT_FN_ATTRS
   4835 _mm_castpd_si128(__m128d __a)
   4836 { /* Vector-to-vector cast: the 128 bits are reinterpreted, not converted. */
   4837   return (__m128i)__a;
   4838 }
   4839 
   4840 /// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
   4841 ///    floating-point vector of [2 x double].
   4842 ///
   4843 /// \headerfile <x86intrin.h>
   4844 ///
   4845 /// This intrinsic has no corresponding instruction.
   4846 ///
   4847 /// \param __a
   4848 ///    A 128-bit floating-point vector of [4 x float].
   4849 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
   4850 ///    bitwise pattern as the parameter.
   4851 static __inline__ __m128d __DEFAULT_FN_ATTRS
   4852 _mm_castps_pd(__m128 __a)
   4853 { /* Vector-to-vector cast: the 128 bits are reinterpreted, not converted. */
   4854   return (__m128d)__a;
   4855 }
   4856 
   4857 /// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
   4858 ///    integer vector.
   4859 ///
   4860 /// \headerfile <x86intrin.h>
   4861 ///
   4862 /// This intrinsic has no corresponding instruction.
   4863 ///
   4864 /// \param __a
   4865 ///    A 128-bit floating-point vector of [4 x float].
   4866 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
   4867 ///    parameter.
   4868 static __inline__ __m128i __DEFAULT_FN_ATTRS
   4869 _mm_castps_si128(__m128 __a)
   4870 { /* Vector-to-vector cast: the 128 bits are reinterpreted, not converted. */
   4871   return (__m128i)__a;
   4872 }
   4873 
   4874 /// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
   4875 ///    of [4 x float].
   4876 ///
   4877 /// \headerfile <x86intrin.h>
   4878 ///
   4879 /// This intrinsic has no corresponding instruction.
   4880 ///
   4881 /// \param __a
   4882 ///    A 128-bit integer vector.
   4883 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
   4884 ///    bitwise pattern as the parameter.
   4885 static __inline__ __m128 __DEFAULT_FN_ATTRS
   4886 _mm_castsi128_ps(__m128i __a)
   4887 { /* Vector-to-vector cast: the 128 bits are reinterpreted, not converted. */
   4888   return (__m128)__a;
   4889 }
   4890 
   4891 /// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
   4892 ///    of [2 x double].
   4893 ///
   4894 /// \headerfile <x86intrin.h>
   4895 ///
   4896 /// This intrinsic has no corresponding instruction.
   4897 ///
   4898 /// \param __a
   4899 ///    A 128-bit integer vector.
   4900 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
   4901 ///    bitwise pattern as the parameter.
   4902 static __inline__ __m128d __DEFAULT_FN_ATTRS
   4903 _mm_castsi128_pd(__m128i __a)
   4904 { /* Vector-to-vector cast: the 128 bits are reinterpreted, not converted. */
   4905   return (__m128d)__a;
   4906 }
   4907 
   4908 #if defined(__cplusplus) /* C++ only: give _mm_pause C linkage */
   4909 extern "C" {
   4910 #endif
   4911 
   4912 /// \brief Indicates that the caller is executing a spin loop, so that power
   4913 ///    consumption can be optimized for the duration of the loop.
   4914 ///
   4915 /// \headerfile <x86intrin.h>
   4916 ///
   4917 /// This intrinsic corresponds to the <c> PAUSE </c> instruction.
   4918 ///
   4919 void _mm_pause(void); /* declaration only; no body is given in this section */
   4920 
   4921 #if defined(__cplusplus)
   4922 } // extern "C"
   4923 #endif
   4924 #undef __DEFAULT_FN_ATTRS
   4925 
   4926 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) /* x lands in bit 1, y in bit 0 */
   4927 
   4928 #define _MM_DENORMALS_ZERO_ON   (0x0040) /* mask bit set */
   4929 #define _MM_DENORMALS_ZERO_OFF  (0x0000) /* mask bit clear */
   4930 
   4931 #define _MM_DENORMALS_ZERO_MASK (0x0040) /* bit 6 of the value from _mm_getcsr() */
   4932 
   4933 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
   4934 #define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) /* clear the mask bit, then OR in x */
   4935 
   4936 #endif /* __EMMINTRIN_H */
   4937