Home | History | Annotate | Download | only in Headers
      1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
      2  *
      3  * Permission is hereby granted, free of charge, to any person obtaining a copy
      4  * of this software and associated documentation files (the "Software"), to deal
      5  * in the Software without restriction, including without limitation the rights
      6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      7  * copies of the Software, and to permit persons to whom the Software is
      8  * furnished to do so, subject to the following conditions:
      9  *
     10  * The above copyright notice and this permission notice shall be included in
     11  * all copies or substantial portions of the Software.
     12  *
     13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     19  * THE SOFTWARE.
     20  *
     21  *===-----------------------------------------------------------------------===
     22  */
     23 
     24 #ifndef __XMMINTRIN_H
     25 #define __XMMINTRIN_H
     26 
     27 #include <mmintrin.h>
     28 
     29 typedef int __v4si __attribute__((__vector_size__(16)));
     30 typedef float __v4sf __attribute__((__vector_size__(16)));
     31 typedef float __m128 __attribute__((__vector_size__(16)));
     32 
     33 /* Unsigned types */
     34 typedef unsigned int __v4su __attribute__((__vector_size__(16)));
     35 
/* <mm_malloc.h> is only included in a hosted environment, as it depends on
 * a standard library to provide allocation routines. */
     38 #if __STDC_HOSTED__
     39 #include <mm_malloc.h>
     40 #endif
     41 
     42 /* Define the default attributes for the functions in this file. */
     43 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse")))
     44 
     45 /// \brief Adds the 32-bit float values in the low-order bits of the operands.
     46 ///
     47 /// \headerfile <x86intrin.h>
     48 ///
     49 /// This intrinsic corresponds to the \c VADDSS / ADDSS instructions.
     50 ///
     51 /// \param __a
     52 ///    A 128-bit vector of [4 x float] containing one of the source operands.
     53 ///    The lower 32 bits of this operand are used in the calculation.
     54 /// \param __b
     55 ///    A 128-bit vector of [4 x float] containing one of the source operands.
     56 ///    The lower 32 bits of this operand are used in the calculation.
     57 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
     58 ///    of the lower 32 bits of both operands. The upper 96 bits are copied from
     59 ///    the upper 96 bits of the first source operand.
     60 static __inline__ __m128 __DEFAULT_FN_ATTRS
     61 _mm_add_ss(__m128 __a, __m128 __b)
     62 {
     63   __a[0] += __b[0];
     64   return __a;
     65 }
     66 
     67 /// \brief Adds two 128-bit vectors of [4 x float], and returns the results of
     68 ///    the addition.
     69 ///
     70 /// \headerfile <x86intrin.h>
     71 ///
     72 /// This intrinsic corresponds to the \c VADDPS / ADDPS instructions.
     73 ///
     74 /// \param __a
     75 ///    A 128-bit vector of [4 x float] containing one of the source operands.
     76 /// \param __b
     77 ///    A 128-bit vector of [4 x float] containing one of the source operands.
     78 /// \returns A 128-bit vector of [4 x float] containing the sums of both
     79 ///    operands.
     80 static __inline__ __m128 __DEFAULT_FN_ATTRS
     81 _mm_add_ps(__m128 __a, __m128 __b)
     82 {
     83   return (__m128)((__v4sf)__a + (__v4sf)__b);
     84 }
     85 
     86 /// \brief Subtracts the 32-bit float value in the low-order bits of the second
     87 ///    operand from the corresponding value in the first operand.
     88 ///
     89 /// \headerfile <x86intrin.h>
     90 ///
     91 /// This intrinsic corresponds to the \c VSUBSS / SUBSS instructions.
     92 ///
     93 /// \param __a
     94 ///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
     95 ///    of this operand are used in the calculation.
     96 /// \param __b
     97 ///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
     98 ///    bits of this operand are used in the calculation.
     99 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
    100 ///    difference of the lower 32 bits of both operands. The upper 96 bits are
    101 ///    copied from the upper 96 bits of the first source operand.
    102 static __inline__ __m128 __DEFAULT_FN_ATTRS
    103 _mm_sub_ss(__m128 __a, __m128 __b)
    104 {
    105   __a[0] -= __b[0];
    106   return __a;
    107 }
    108 
    109 /// \brief Subtracts each of the values of the second operand from the first
    110 ///    operand, both of which are 128-bit vectors of [4 x float] and returns
    111 ///    the results of the subtraction.
    112 ///
    113 /// \headerfile <x86intrin.h>
    114 ///
    115 /// This intrinsic corresponds to the \c VSUBPS / SUBPS instructions.
    116 ///
    117 /// \param __a
    118 ///    A 128-bit vector of [4 x float] containing the minuend.
    119 /// \param __b
    120 ///    A 128-bit vector of [4 x float] containing the subtrahend.
    121 /// \returns A 128-bit vector of [4 x float] containing the differences between
    122 ///    both operands.
    123 static __inline__ __m128 __DEFAULT_FN_ATTRS
    124 _mm_sub_ps(__m128 __a, __m128 __b)
    125 {
    126   return (__m128)((__v4sf)__a - (__v4sf)__b);
    127 }
    128 
    129 /// \brief Multiplies two 32-bit float values in the low-order bits of the
    130 ///    operands.
    131 ///
    132 /// \headerfile <x86intrin.h>
    133 ///
    134 /// This intrinsic corresponds to the \c VMULSS / MULSS instructions.
    135 ///
    136 /// \param __a
    137 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    138 ///    The lower 32 bits of this operand are used in the calculation.
    139 /// \param __b
    140 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    141 ///    The lower 32 bits of this operand are used in the calculation.
    142 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
    143 ///    32 bits of both operands. The upper 96 bits are copied from the upper 96
    144 ///    bits of the first source operand.
    145 static __inline__ __m128 __DEFAULT_FN_ATTRS
    146 _mm_mul_ss(__m128 __a, __m128 __b)
    147 {
    148   __a[0] *= __b[0];
    149   return __a;
    150 }
    151 
    152 /// \brief Multiplies two 128-bit vectors of [4 x float] and returns the
    153 ///    results of the multiplication.
    154 ///
    155 /// \headerfile <x86intrin.h>
    156 ///
    157 /// This intrinsic corresponds to the \c VMULPS / MULPS instructions.
    158 ///
    159 /// \param __a
    160 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    161 /// \param __b
    162 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    163 /// \returns A 128-bit vector of [4 x float] containing the products of both
    164 ///    operands.
    165 static __inline__ __m128 __DEFAULT_FN_ATTRS
    166 _mm_mul_ps(__m128 __a, __m128 __b)
    167 {
    168   return (__m128)((__v4sf)__a * (__v4sf)__b);
    169 }
    170 
    171 /// \brief Divides the value in the low-order 32 bits of the first operand by
    172 ///    the corresponding value in the second operand.
    173 ///
    174 /// \headerfile <x86intrin.h>
    175 ///
    176 /// This intrinsic corresponds to the \c VDIVSS / DIVSS instructions.
    177 ///
    178 /// \param __a
    179 ///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
    180 ///    bits of this operand are used in the calculation.
    181 /// \param __b
    182 ///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
    183 ///    of this operand are used in the calculation.
    184 /// \returns A 128-bit vector of [4 x float] containing the quotients of the
    185 ///    lower 32 bits of both operands. The upper 96 bits are copied from the
    186 ///    upper 96 bits of the first source operand.
    187 static __inline__ __m128 __DEFAULT_FN_ATTRS
    188 _mm_div_ss(__m128 __a, __m128 __b)
    189 {
    190   __a[0] /= __b[0];
    191   return __a;
    192 }
    193 
    194 /// \brief Divides two 128-bit vectors of [4 x float].
    195 ///
    196 /// \headerfile <x86intrin.h>
    197 ///
    198 /// This intrinsic corresponds to the \c VDIVPS / DIVPS instructions.
    199 ///
    200 /// \param __a
    201 ///    A 128-bit vector of [4 x float] containing the dividend.
    202 /// \param __b
    203 ///    A 128-bit vector of [4 x float] containing the divisor.
    204 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
    205 ///    operands.
    206 static __inline__ __m128 __DEFAULT_FN_ATTRS
    207 _mm_div_ps(__m128 __a, __m128 __b)
    208 {
    209   return (__m128)((__v4sf)__a / (__v4sf)__b);
    210 }
    211 
    212 /// \brief Calculates the square root of the value stored in the low-order bits
    213 ///    of a 128-bit vector of [4 x float].
    214 ///
    215 /// \headerfile <x86intrin.h>
    216 ///
    217 /// This intrinsic corresponds to the \c VSQRTSS / SQRTSS instructions.
    218 ///
    219 /// \param __a
    220 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    221 ///    used in the calculation.
    222 /// \returns A 128-bit vector of [4 x float] containing the square root of the
    223 ///    value in the low-order bits of the operand.
    224 static __inline__ __m128 __DEFAULT_FN_ATTRS
    225 _mm_sqrt_ss(__m128 __a)
    226 {
    227   __m128 __c = __builtin_ia32_sqrtss((__v4sf)__a);
    228   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
    229 }
    230 
    231 /// \brief Calculates the square roots of the values stored in a 128-bit vector
    232 ///    of [4 x float].
    233 ///
    234 /// \headerfile <x86intrin.h>
    235 ///
    236 /// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instructions.
    237 ///
    238 /// \param __a
    239 ///    A 128-bit vector of [4 x float].
    240 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
    241 ///    values in the operand.
    242 static __inline__ __m128 __DEFAULT_FN_ATTRS
    243 _mm_sqrt_ps(__m128 __a)
    244 {
    245   return __builtin_ia32_sqrtps((__v4sf)__a);
    246 }
    247 
    248 /// \brief Calculates the approximate reciprocal of the value stored in the
    249 ///    low-order bits of a 128-bit vector of [4 x float].
    250 ///
    251 /// \headerfile <x86intrin.h>
    252 ///
    253 /// This intrinsic corresponds to the \c VRCPSS / RCPSS instructions.
    254 ///
    255 /// \param __a
    256 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    257 ///    used in the calculation.
    258 /// \returns A 128-bit vector of [4 x float] containing the approximate
    259 ///    reciprocal of the value in the low-order bits of the operand.
    260 static __inline__ __m128 __DEFAULT_FN_ATTRS
    261 _mm_rcp_ss(__m128 __a)
    262 {
    263   __m128 __c = __builtin_ia32_rcpss((__v4sf)__a);
    264   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
    265 }
    266 
    267 /// \brief Calculates the approximate reciprocals of the values stored in a
    268 ///    128-bit vector of [4 x float].
    269 ///
    270 /// \headerfile <x86intrin.h>
    271 ///
    272 /// This intrinsic corresponds to the \c VRCPPS / RCPPS instructions.
    273 ///
    274 /// \param __a
    275 ///    A 128-bit vector of [4 x float].
    276 /// \returns A 128-bit vector of [4 x float] containing the approximate
    277 ///    reciprocals of the values in the operand.
    278 static __inline__ __m128 __DEFAULT_FN_ATTRS
    279 _mm_rcp_ps(__m128 __a)
    280 {
    281   return __builtin_ia32_rcpps((__v4sf)__a);
    282 }
    283 
    284 /// \brief Calculates the approximate reciprocal of the square root of the value
    285 ///    stored in the low-order bits of a 128-bit vector of [4 x float].
    286 ///
    287 /// \headerfile <x86intrin.h>
    288 ///
    289 /// This intrinsic corresponds to the \c VRSQRTSS / RSQRTSS instructions.
    290 ///
    291 /// \param __a
    292 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    293 ///    used in the calculation.
    294 /// \returns A 128-bit vector of [4 x float] containing the approximate
    295 ///    reciprocal of the square root of the value in the low-order bits of the
    296 ///    operand.
    297 static __inline__ __m128 __DEFAULT_FN_ATTRS
    298 _mm_rsqrt_ss(__m128 __a)
    299 {
    300   __m128 __c = __builtin_ia32_rsqrtss((__v4sf)__a);
    301   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
    302 }
    303 
    304 /// \brief Calculates the approximate reciprocals of the square roots of the
    305 ///    values stored in a 128-bit vector of [4 x float].
    306 ///
    307 /// \headerfile <x86intrin.h>
    308 ///
    309 /// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instructions.
    310 ///
    311 /// \param __a
    312 ///    A 128-bit vector of [4 x float].
    313 /// \returns A 128-bit vector of [4 x float] containing the approximate
    314 ///    reciprocals of the square roots of the values in the operand.
    315 static __inline__ __m128 __DEFAULT_FN_ATTRS
    316 _mm_rsqrt_ps(__m128 __a)
    317 {
    318   return __builtin_ia32_rsqrtps((__v4sf)__a);
    319 }
    320 
    321 /// \brief Compares two 32-bit float values in the low-order bits of both
    322 ///    operands and returns the lesser value in the low-order bits of the
    323 ///    vector of [4 x float].
    324 ///
    325 /// \headerfile <x86intrin.h>
    326 ///
    327 /// This intrinsic corresponds to the \c VMINSS / MINSS instructions.
    328 ///
    329 /// \param __a
    330 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    331 ///    32 bits of this operand are used in the comparison.
    332 /// \param __b
    333 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    334 ///    32 bits of this operand are used in the comparison.
    335 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
    336 ///    minimum value between both operands. The upper 96 bits are copied from
    337 ///    the upper 96 bits of the first source operand.
    338 static __inline__ __m128 __DEFAULT_FN_ATTRS
    339 _mm_min_ss(__m128 __a, __m128 __b)
    340 {
    341   return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
    342 }
    343 
    344 /// \brief Compares two 128-bit vectors of [4 x float] and returns the
    345 ///    lesser of each pair of values.
    346 ///
    347 /// \headerfile <x86intrin.h>
    348 ///
    349 /// This intrinsic corresponds to the \c VMINPS / MINPS instructions.
    350 ///
    351 /// \param __a
    352 ///    A 128-bit vector of [4 x float] containing one of the operands.
    353 /// \param __b
    354 ///    A 128-bit vector of [4 x float] containing one of the operands.
    355 /// \returns A 128-bit vector of [4 x float] containing the minimum values
    356 ///    between both operands.
    357 static __inline__ __m128 __DEFAULT_FN_ATTRS
    358 _mm_min_ps(__m128 __a, __m128 __b)
    359 {
    360   return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
    361 }
    362 
    363 /// \brief Compares two 32-bit float values in the low-order bits of both
    364 ///    operands and returns the greater value in the low-order bits of
    365 ///    a vector [4 x float].
    366 ///
    367 /// \headerfile <x86intrin.h>
    368 ///
    369 /// This intrinsic corresponds to the \c VMAXSS / MAXSS instructions.
    370 ///
    371 /// \param __a
    372 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    373 ///    32 bits of this operand are used in the comparison.
    374 /// \param __b
    375 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    376 ///    32 bits of this operand are used in the comparison.
    377 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
    378 ///    maximum value between both operands. The upper 96 bits are copied from
    379 ///    the upper 96 bits of the first source operand.
    380 static __inline__ __m128 __DEFAULT_FN_ATTRS
    381 _mm_max_ss(__m128 __a, __m128 __b)
    382 {
    383   return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
    384 }
    385 
    386 /// \brief Compares two 128-bit vectors of [4 x float] and returns the greater
    387 ///    of each pair of values.
    388 ///
    389 /// \headerfile <x86intrin.h>
    390 ///
    391 /// This intrinsic corresponds to the \c VMAXPS / MAXPS instructions.
    392 ///
    393 /// \param __a
    394 ///    A 128-bit vector of [4 x float] containing one of the operands.
    395 /// \param __b
    396 ///    A 128-bit vector of [4 x float] containing one of the operands.
    397 /// \returns A 128-bit vector of [4 x float] containing the maximum values
    398 ///    between both operands.
    399 static __inline__ __m128 __DEFAULT_FN_ATTRS
    400 _mm_max_ps(__m128 __a, __m128 __b)
    401 {
    402   return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
    403 }
    404 
    405 /// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float].
    406 ///
    407 /// \headerfile <x86intrin.h>
    408 ///
    409 /// This intrinsic corresponds to the \c VANDPS / ANDPS instructions.
    410 ///
    411 /// \param __a
    412 ///    A 128-bit vector containing one of the source operands.
    413 /// \param __b
    414 ///    A 128-bit vector containing one of the source operands.
    415 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
    416 ///    values between both operands.
    417 static __inline__ __m128 __DEFAULT_FN_ATTRS
    418 _mm_and_ps(__m128 __a, __m128 __b)
    419 {
    420   return (__m128)((__v4su)__a & (__v4su)__b);
    421 }
    422 
    423 /// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float], using
    424 ///    the one's complement of the values contained in the first source
    425 ///    operand.
    426 ///
    427 /// \headerfile <x86intrin.h>
    428 ///
    429 /// This intrinsic corresponds to the \c VANDNPS / ANDNPS instructions.
    430 ///
    431 /// \param __a
    432 ///    A 128-bit vector of [4 x float] containing the first source operand. The
    433 ///    one's complement of this value is used in the bitwise AND.
    434 /// \param __b
    435 ///    A 128-bit vector of [4 x float] containing the second source operand.
    436 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
    437 ///    one's complement of the first operand and the values in the second
    438 ///    operand.
    439 static __inline__ __m128 __DEFAULT_FN_ATTRS
    440 _mm_andnot_ps(__m128 __a, __m128 __b)
    441 {
    442   return (__m128)(~(__v4su)__a & (__v4su)__b);
    443 }
    444 
    445 /// \brief Performs a bitwise OR of two 128-bit vectors of [4 x float].
    446 ///
    447 /// \headerfile <x86intrin.h>
    448 ///
    449 /// This intrinsic corresponds to the \c VORPS / ORPS instructions.
    450 ///
    451 /// \param __a
    452 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    453 /// \param __b
    454 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    455 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
    456 ///    values between both operands.
    457 static __inline__ __m128 __DEFAULT_FN_ATTRS
    458 _mm_or_ps(__m128 __a, __m128 __b)
    459 {
    460   return (__m128)((__v4su)__a | (__v4su)__b);
    461 }
    462 
    463 /// \brief Performs a bitwise exclusive OR of two 128-bit vectors of
    464 ///    [4 x float].
    465 ///
    466 /// \headerfile <x86intrin.h>
    467 ///
    468 /// This intrinsic corresponds to the \c VXORPS / XORPS instructions.
    469 ///
    470 /// \param __a
    471 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    472 /// \param __b
    473 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    474 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
    475 ///    of the values between both operands.
    476 static __inline__ __m128 __DEFAULT_FN_ATTRS
    477 _mm_xor_ps(__m128 __a, __m128 __b)
    478 {
    479   return (__m128)((__v4su)__a ^ (__v4su)__b);
    480 }
    481 
    482 /// \brief Compares two 32-bit float values in the low-order bits of both
    483 ///    operands for equality and returns the result of the comparison in the
    484 ///    low-order bits of a vector [4 x float].
    485 ///
    486 /// \headerfile <x86intrin.h>
    487 ///
    488 /// This intrinsic corresponds to the \c VCMPEQSS / CMPEQSS instructions.
    489 ///
    490 /// \param __a
    491 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    492 ///    32 bits of this operand are used in the comparison.
    493 /// \param __b
    494 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    495 ///    32 bits of this operand are used in the comparison.
    496 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    497 ///    in the low-order bits.
    498 static __inline__ __m128 __DEFAULT_FN_ATTRS
    499 _mm_cmpeq_ss(__m128 __a, __m128 __b)
    500 {
    501   return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
    502 }
    503 
    504 /// \brief Compares each of the corresponding 32-bit float values of the
    505 ///    128-bit vectors of [4 x float] for equality.
    506 ///
    507 /// \headerfile <x86intrin.h>
    508 ///
    509 /// This intrinsic corresponds to the \c VCMPEQPS / CMPEQPS instructions.
    510 ///
    511 /// \param __a
    512 ///    A 128-bit vector of [4 x float].
    513 /// \param __b
    514 ///    A 128-bit vector of [4 x float].
    515 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    516 static __inline__ __m128 __DEFAULT_FN_ATTRS
    517 _mm_cmpeq_ps(__m128 __a, __m128 __b)
    518 {
    519   return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
    520 }
    521 
    522 /// \brief Compares two 32-bit float values in the low-order bits of both
    523 ///    operands to determine if the value in the first operand is less than the
    524 ///    corresponding value in the second operand and returns the result of the
    525 ///    comparison in the low-order bits of a vector of [4 x float].
    526 ///
    527 /// \headerfile <x86intrin.h>
    528 ///
    529 /// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
    530 ///
    531 /// \param __a
    532 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    533 ///    32 bits of this operand are used in the comparison.
    534 /// \param __b
    535 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    536 ///    32 bits of this operand are used in the comparison.
    537 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    538 ///    in the low-order bits.
    539 static __inline__ __m128 __DEFAULT_FN_ATTRS
    540 _mm_cmplt_ss(__m128 __a, __m128 __b)
    541 {
    542   return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
    543 }
    544 
    545 /// \brief Compares each of the corresponding 32-bit float values of the
    546 ///    128-bit vectors of [4 x float] to determine if the values in the first
    547 ///    operand are less than those in the second operand.
    548 ///
    549 /// \headerfile <x86intrin.h>
    550 ///
    551 /// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
    552 ///
    553 /// \param __a
    554 ///    A 128-bit vector of [4 x float].
    555 /// \param __b
    556 ///    A 128-bit vector of [4 x float].
    557 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    558 static __inline__ __m128 __DEFAULT_FN_ATTRS
    559 _mm_cmplt_ps(__m128 __a, __m128 __b)
    560 {
    561   return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
    562 }
    563 
    564 /// \brief Compares two 32-bit float values in the low-order bits of both
    565 ///    operands to determine if the value in the first operand is less than or
    566 ///    equal to the corresponding value in the second operand and returns the
    567 ///    result of the comparison in the low-order bits of a vector of
    568 ///    [4 x float].
    569 ///
    570 /// \headerfile <x86intrin.h>
    571 ///
    572 /// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
    573 ///
    574 /// \param __a
    575 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    576 ///    32 bits of this operand are used in the comparison.
    577 /// \param __b
    578 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    579 ///    32 bits of this operand are used in the comparison.
    580 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    581 ///    in the low-order bits.
    582 static __inline__ __m128 __DEFAULT_FN_ATTRS
    583 _mm_cmple_ss(__m128 __a, __m128 __b)
    584 {
    585   return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
    586 }
    587 
    588 /// \brief Compares each of the corresponding 32-bit float values of the
    589 ///    128-bit vectors of [4 x float] to determine if the values in the first
    590 ///    operand are less than or equal to those in the second operand.
    591 ///
    592 /// \headerfile <x86intrin.h>
    593 ///
    594 /// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
    595 ///
    596 /// \param __a
    597 ///    A 128-bit vector of [4 x float].
    598 /// \param __b
    599 ///    A 128-bit vector of [4 x float].
    600 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    601 static __inline__ __m128 __DEFAULT_FN_ATTRS
    602 _mm_cmple_ps(__m128 __a, __m128 __b)
    603 {
    604   return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
    605 }
    606 
    607 /// \brief Compares two 32-bit float values in the low-order bits of both
    608 ///    operands to determine if the value in the first operand is greater than
    609 ///    the corresponding value in the second operand and returns the result of
    610 ///    the comparison in the low-order bits of a vector of [4 x float].
    611 ///
    612 /// \headerfile <x86intrin.h>
    613 ///
    614 /// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
    615 ///
    616 /// \param __a
    617 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    618 ///    32 bits of this operand are used in the comparison.
    619 /// \param __b
    620 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    621 ///    32 bits of this operand are used in the comparison.
    622 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    623 ///    in the low-order bits.
    624 static __inline__ __m128 __DEFAULT_FN_ATTRS
    625 _mm_cmpgt_ss(__m128 __a, __m128 __b)
    626 {
    627   return (__m128)__builtin_shufflevector((__v4sf)__a,
    628                                          (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
    629                                          4, 1, 2, 3);
    630 }
    631 
    632 /// \brief Compares each of the corresponding 32-bit float values of the
    633 ///    128-bit vectors of [4 x float] to determine if the values in the first
    634 ///    operand are greater than those in the second operand.
    635 ///
    636 /// \headerfile <x86intrin.h>
    637 ///
    638 /// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
    639 ///
    640 /// \param __a
    641 ///    A 128-bit vector of [4 x float].
    642 /// \param __b
    643 ///    A 128-bit vector of [4 x float].
    644 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    645 static __inline__ __m128 __DEFAULT_FN_ATTRS
    646 _mm_cmpgt_ps(__m128 __a, __m128 __b)
    647 {
    648   return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
    649 }
    650 
    651 /// \brief Compares two 32-bit float values in the low-order bits of both
    652 ///    operands to determine if the value in the first operand is greater than
    653 ///    or equal to the corresponding value in the second operand and returns
    654 ///    the result of the comparison in the low-order bits of a vector of
    655 ///    [4 x float].
    656 ///
    657 /// \headerfile <x86intrin.h>
    658 ///
    659 /// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
    660 ///
    661 /// \param __a
    662 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    663 ///    32 bits of this operand are used in the comparison.
    664 /// \param __b
    665 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    666 ///    32 bits of this operand are used in the comparison.
    667 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    668 ///    in the low-order bits.
    669 static __inline__ __m128 __DEFAULT_FN_ATTRS
    670 _mm_cmpge_ss(__m128 __a, __m128 __b)
    671 {
    672   return (__m128)__builtin_shufflevector((__v4sf)__a,
    673                                          (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
    674                                          4, 1, 2, 3);
    675 }
    676 
    677 /// \brief Compares each of the corresponding 32-bit float values of the
    678 ///    128-bit vectors of [4 x float] to determine if the values in the first
    679 ///    operand are greater than or equal to those in the second operand.
    680 ///
    681 /// \headerfile <x86intrin.h>
    682 ///
    683 /// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
    684 ///
    685 /// \param __a
    686 ///    A 128-bit vector of [4 x float].
    687 /// \param __b
    688 ///    A 128-bit vector of [4 x float].
    689 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    690 static __inline__ __m128 __DEFAULT_FN_ATTRS
    691 _mm_cmpge_ps(__m128 __a, __m128 __b)
    692 {
    693   return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
    694 }
    695 
    696 /// \brief Compares two 32-bit float values in the low-order bits of both
    697 ///    operands for inequality and returns the result of the comparison in the
    698 ///    low-order bits of a vector of [4 x float].
    699 ///
    700 /// \headerfile <x86intrin.h>
    701 ///
    702 /// This intrinsic corresponds to the \c VCMPNEQSS / CMPNEQSS instructions.
    703 ///
    704 /// \param __a
    705 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    706 ///    32 bits of this operand are used in the comparison.
    707 /// \param __b
    708 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    709 ///    32 bits of this operand are used in the comparison.
    710 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    711 ///    in the low-order bits.
    712 static __inline__ __m128 __DEFAULT_FN_ATTRS
    713 _mm_cmpneq_ss(__m128 __a, __m128 __b)
    714 {
    715   return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
    716 }
    717 
    718 /// \brief Compares each of the corresponding 32-bit float values of the
    719 ///    128-bit vectors of [4 x float] for inequality.
    720 ///
    721 /// \headerfile <x86intrin.h>
    722 ///
    723 /// This intrinsic corresponds to the \c VCMPNEQPS / CMPNEQPS instructions.
    724 ///
    725 /// \param __a
    726 ///    A 128-bit vector of [4 x float].
    727 /// \param __b
    728 ///    A 128-bit vector of [4 x float].
    729 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    730 static __inline__ __m128 __DEFAULT_FN_ATTRS
    731 _mm_cmpneq_ps(__m128 __a, __m128 __b)
    732 {
    733   return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
    734 }
    735 
    736 /// \brief Compares two 32-bit float values in the low-order bits of both
    737 ///    operands to determine if the value in the first operand is not less than
    738 ///    the corresponding value in the second operand and returns the result of
    739 ///    the comparison in the low-order bits of a vector of [4 x float].
    740 ///
    741 /// \headerfile <x86intrin.h>
    742 ///
    743 /// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
    744 ///
    745 /// \param __a
    746 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    747 ///    32 bits of this operand are used in the comparison.
    748 /// \param __b
    749 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    750 ///    32 bits of this operand are used in the comparison.
    751 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    752 ///    in the low-order bits.
    753 static __inline__ __m128 __DEFAULT_FN_ATTRS
    754 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
    755 {
    756   return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
    757 }
    758 
    759 /// \brief Compares each of the corresponding 32-bit float values of the
    760 ///    128-bit vectors of [4 x float] to determine if the values in the first
    761 ///    operand are not less than those in the second operand.
    762 ///
    763 /// \headerfile <x86intrin.h>
    764 ///
    765 /// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
    766 ///
    767 /// \param __a
    768 ///    A 128-bit vector of [4 x float].
    769 /// \param __b
    770 ///    A 128-bit vector of [4 x float].
    771 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    772 static __inline__ __m128 __DEFAULT_FN_ATTRS
    773 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
    774 {
    775   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
    776 }
    777 
    778 /// \brief Compares two 32-bit float values in the low-order bits of both
    779 ///    operands to determine if the value in the first operand is not less than
    780 ///    or equal to the corresponding value in the second operand and returns
    781 ///    the result of the comparison in the low-order bits of a vector of
    782 ///    [4 x float].
    783 ///
    784 /// \headerfile <x86intrin.h>
    785 ///
    786 /// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
    787 ///
    788 /// \param __a
    789 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    790 ///    32 bits of this operand are used in the comparison.
    791 /// \param __b
    792 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    793 ///    32 bits of this operand are used in the comparison.
    794 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    795 ///    in the low-order bits.
    796 static __inline__ __m128 __DEFAULT_FN_ATTRS
    797 _mm_cmpnle_ss(__m128 __a, __m128 __b)
    798 {
    799   return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
    800 }
    801 
    802 /// \brief Compares each of the corresponding 32-bit float values of the
    803 ///    128-bit vectors of [4 x float] to determine if the values in the first
    804 ///    operand are not less than or equal to those in the second operand.
    805 ///
    806 /// \headerfile <x86intrin.h>
    807 ///
    808 /// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
    809 ///
    810 /// \param __a
    811 ///    A 128-bit vector of [4 x float].
    812 /// \param __b
    813 ///    A 128-bit vector of [4 x float].
    814 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    815 static __inline__ __m128 __DEFAULT_FN_ATTRS
    816 _mm_cmpnle_ps(__m128 __a, __m128 __b)
    817 {
    818   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
    819 }
    820 
    821 /// \brief Compares two 32-bit float values in the low-order bits of both
    822 ///    operands to determine if the value in the first operand is not greater
    823 ///    than the corresponding value in the second operand and returns the
    824 ///    result of the comparison in the low-order bits of a vector of
    825 ///    [4 x float].
    826 ///
    827 /// \headerfile <x86intrin.h>
    828 ///
    829 /// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
    830 ///
    831 /// \param __a
    832 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    833 ///    32 bits of this operand are used in the comparison.
    834 /// \param __b
    835 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    836 ///    32 bits of this operand are used in the comparison.
    837 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    838 ///    in the low-order bits.
    839 static __inline__ __m128 __DEFAULT_FN_ATTRS
    840 _mm_cmpngt_ss(__m128 __a, __m128 __b)
    841 {
    842   return (__m128)__builtin_shufflevector((__v4sf)__a,
    843                                          (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
    844                                          4, 1, 2, 3);
    845 }
    846 
    847 /// \brief Compares each of the corresponding 32-bit float values of the
    848 ///    128-bit vectors of [4 x float] to determine if the values in the first
    849 ///    operand are not greater than those in the second operand.
    850 ///
    851 /// \headerfile <x86intrin.h>
    852 ///
    853 /// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
    854 ///
    855 /// \param __a
    856 ///    A 128-bit vector of [4 x float].
    857 /// \param __b
    858 ///    A 128-bit vector of [4 x float].
    859 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    860 static __inline__ __m128 __DEFAULT_FN_ATTRS
    861 _mm_cmpngt_ps(__m128 __a, __m128 __b)
    862 {
    863   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
    864 }
    865 
    866 /// \brief Compares two 32-bit float values in the low-order bits of both
    867 ///    operands to determine if the value in the first operand is not greater
    868 ///    than or equal to the corresponding value in the second operand and
    869 ///    returns the result of the comparison in the low-order bits of a vector
    870 ///    of [4 x float].
    871 ///
    872 /// \headerfile <x86intrin.h>
    873 ///
    874 /// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
    875 ///
    876 /// \param __a
    877 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    878 ///    32 bits of this operand are used in the comparison.
    879 /// \param __b
    880 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    881 ///    32 bits of this operand are used in the comparison.
    882 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    883 ///    in the low-order bits.
    884 static __inline__ __m128 __DEFAULT_FN_ATTRS
    885 _mm_cmpnge_ss(__m128 __a, __m128 __b)
    886 {
    887   return (__m128)__builtin_shufflevector((__v4sf)__a,
    888                                          (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
    889                                          4, 1, 2, 3);
    890 }
    891 
    892 /// \brief Compares each of the corresponding 32-bit float values of the
    893 ///    128-bit vectors of [4 x float] to determine if the values in the first
    894 ///    operand are not greater than or equal to those in the second operand.
    895 ///
    896 /// \headerfile <x86intrin.h>
    897 ///
    898 /// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
    899 ///
    900 /// \param __a
    901 ///    A 128-bit vector of [4 x float].
    902 /// \param __b
    903 ///    A 128-bit vector of [4 x float].
    904 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    905 static __inline__ __m128 __DEFAULT_FN_ATTRS
    906 _mm_cmpnge_ps(__m128 __a, __m128 __b)
    907 {
    908   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
    909 }
    910 
    911 /// \brief Compares two 32-bit float values in the low-order bits of both
    912 ///    operands to determine if the value in the first operand is ordered with
    913 ///    respect to the corresponding value in the second operand and returns the
    914 ///    result of the comparison in the low-order bits of a vector of
    915 ///    [4 x float].
    916 ///
    917 /// \headerfile <x86intrin.h>
    918 ///
    919 /// This intrinsic corresponds to the \c VCMPORDSS / CMPORDSS instructions.
    920 ///
    921 /// \param __a
    922 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    923 ///    32 bits of this operand are used in the comparison.
    924 /// \param __b
    925 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    926 ///    32 bits of this operand are used in the comparison.
    927 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    928 ///    in the low-order bits.
    929 static __inline__ __m128 __DEFAULT_FN_ATTRS
    930 _mm_cmpord_ss(__m128 __a, __m128 __b)
    931 {
    932   return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
    933 }
    934 
    935 /// \brief Compares each of the corresponding 32-bit float values of the
    936 ///    128-bit vectors of [4 x float] to determine if the values in the first
    937 ///    operand are ordered with respect to those in the second operand.
    938 ///
    939 /// \headerfile <x86intrin.h>
    940 ///
    941 /// This intrinsic corresponds to the \c VCMPORDPS / CMPORDPS instructions.
    942 ///
    943 /// \param __a
    944 ///    A 128-bit vector of [4 x float].
    945 /// \param __b
    946 ///    A 128-bit vector of [4 x float].
    947 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    948 static __inline__ __m128 __DEFAULT_FN_ATTRS
    949 _mm_cmpord_ps(__m128 __a, __m128 __b)
    950 {
    951   return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
    952 }
    953 
    954 /// \brief Compares two 32-bit float values in the low-order bits of both
    955 ///    operands to determine if the value in the first operand is unordered
    956 ///    with respect to the corresponding value in the second operand and
    957 ///    returns the result of the comparison in the low-order bits of a vector
    958 ///    of [4 x float].
    959 ///
    960 /// \headerfile <x86intrin.h>
    961 ///
    962 /// This intrinsic corresponds to the \c VCMPUNORDSS / CMPUNORDSS instructions.
    963 ///
    964 /// \param __a
    965 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    966 ///    32 bits of this operand are used in the comparison.
    967 /// \param __b
    968 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    969 ///    32 bits of this operand are used in the comparison.
    970 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    971 ///    in the low-order bits.
    972 static __inline__ __m128 __DEFAULT_FN_ATTRS
    973 _mm_cmpunord_ss(__m128 __a, __m128 __b)
    974 {
    975   return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
    976 }
    977 
    978 /// \brief Compares each of the corresponding 32-bit float values of the
    979 ///    128-bit vectors of [4 x float] to determine if the values in the first
    980 ///    operand are unordered with respect to those in the second operand.
    981 ///
    982 /// \headerfile <x86intrin.h>
    983 ///
    984 /// This intrinsic corresponds to the \c VCMPUNORDPS / CMPUNORDPS instructions.
    985 ///
    986 /// \param __a
    987 ///    A 128-bit vector of [4 x float].
    988 /// \param __b
    989 ///    A 128-bit vector of [4 x float].
    990 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    991 static __inline__ __m128 __DEFAULT_FN_ATTRS
    992 _mm_cmpunord_ps(__m128 __a, __m128 __b)
    993 {
    994   return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
    995 }
    996 
    997 /// \brief Compares two 32-bit float values in the low-order bits of both
    998 ///    operands for equality and returns the result of the comparison.
    999 ///
   1000 /// \headerfile <x86intrin.h>
   1001 ///
   1002 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
   1003 ///
   1004 /// \param __a
   1005 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1006 ///    used in the comparison.
   1007 /// \param __b
   1008 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1009 ///    used in the comparison.
   1010 /// \returns An integer containing the comparison results.
   1011 static __inline__ int __DEFAULT_FN_ATTRS
   1012 _mm_comieq_ss(__m128 __a, __m128 __b)
   1013 {
   1014   return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
   1015 }
   1016 
   1017 /// \brief Compares two 32-bit float values in the low-order bits of both
   1018 ///    operands to determine if the first operand is less than the second
   1019 ///    operand and returns the result of the comparison.
   1020 ///
   1021 /// \headerfile <x86intrin.h>
   1022 ///
   1023 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
   1024 ///
   1025 /// \param __a
   1026 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1027 ///    used in the comparison.
   1028 /// \param __b
   1029 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1030 ///    used in the comparison.
   1031 /// \returns An integer containing the comparison results.
   1032 static __inline__ int __DEFAULT_FN_ATTRS
   1033 _mm_comilt_ss(__m128 __a, __m128 __b)
   1034 {
   1035   return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
   1036 }
   1037 
   1038 /// \brief Compares two 32-bit float values in the low-order bits of both
   1039 ///    operands to determine if the first operand is less than or equal to the
   1040 ///    second operand and returns the result of the comparison.
   1041 ///
   1042 /// \headerfile <x86intrin.h>
   1043 ///
   1044 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
   1045 ///
   1046 /// \param __a
   1047 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1048 ///    used in the comparison.
   1049 /// \param __b
   1050 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1051 ///    used in the comparison.
   1052 /// \returns An integer containing the comparison results.
   1053 static __inline__ int __DEFAULT_FN_ATTRS
   1054 _mm_comile_ss(__m128 __a, __m128 __b)
   1055 {
   1056   return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
   1057 }
   1058 
   1059 /// \brief Compares two 32-bit float values in the low-order bits of both
   1060 ///    operands to determine if the first operand is greater than the second
   1061 ///    operand and returns the result of the comparison.
   1062 ///
   1063 /// \headerfile <x86intrin.h>
   1064 ///
   1065 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
   1066 ///
   1067 /// \param __a
   1068 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1069 ///    used in the comparison.
   1070 /// \param __b
   1071 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1072 ///    used in the comparison.
   1073 /// \returns An integer containing the comparison results.
   1074 static __inline__ int __DEFAULT_FN_ATTRS
   1075 _mm_comigt_ss(__m128 __a, __m128 __b)
   1076 {
   1077   return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
   1078 }
   1079 
   1080 /// \brief Compares two 32-bit float values in the low-order bits of both
   1081 ///    operands to determine if the first operand is greater than or equal to
   1082 ///    the second operand and returns the result of the comparison.
   1083 ///
   1084 /// \headerfile <x86intrin.h>
   1085 ///
   1086 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
   1087 ///
   1088 /// \param __a
   1089 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1090 ///    used in the comparison.
   1091 /// \param __b
   1092 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1093 ///    used in the comparison.
   1094 /// \returns An integer containing the comparison results.
   1095 static __inline__ int __DEFAULT_FN_ATTRS
   1096 _mm_comige_ss(__m128 __a, __m128 __b)
   1097 {
   1098   return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
   1099 }
   1100 
   1101 /// \brief Compares two 32-bit float values in the low-order bits of both
   1102 ///    operands to determine if the first operand is not equal to the second
   1103 ///    operand and returns the result of the comparison.
   1104 ///
   1105 /// \headerfile <x86intrin.h>
   1106 ///
   1107 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
   1108 ///
   1109 /// \param __a
   1110 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1111 ///    used in the comparison.
   1112 /// \param __b
   1113 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1114 ///    used in the comparison.
   1115 /// \returns An integer containing the comparison results.
   1116 static __inline__ int __DEFAULT_FN_ATTRS
   1117 _mm_comineq_ss(__m128 __a, __m128 __b)
   1118 {
   1119   return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
   1120 }
   1121 
   1122 /// \brief Performs an unordered comparison of two 32-bit float values using
   1123 ///    the low-order bits of both operands to determine equality and returns
   1124 ///    the result of the comparison.
   1125 ///
   1126 /// \headerfile <x86intrin.h>
   1127 ///
   1128 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
   1129 ///
   1130 /// \param __a
   1131 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1132 ///    used in the comparison.
   1133 /// \param __b
   1134 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1135 ///    used in the comparison.
   1136 /// \returns An integer containing the comparison results.
   1137 static __inline__ int __DEFAULT_FN_ATTRS
   1138 _mm_ucomieq_ss(__m128 __a, __m128 __b)
   1139 {
   1140   return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
   1141 }
   1142 
   1143 /// \brief Performs an unordered comparison of two 32-bit float values using
   1144 ///    the low-order bits of both operands to determine if the first operand is
   1145 ///    less than the second operand and returns the result of the comparison.
   1146 ///
   1147 /// \headerfile <x86intrin.h>
   1148 ///
   1149 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
   1150 ///
   1151 /// \param __a
   1152 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1153 ///    used in the comparison.
   1154 /// \param __b
   1155 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1156 ///    used in the comparison.
   1157 /// \returns An integer containing the comparison results.
   1158 static __inline__ int __DEFAULT_FN_ATTRS
   1159 _mm_ucomilt_ss(__m128 __a, __m128 __b)
   1160 {
   1161   return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
   1162 }
   1163 
   1164 /// \brief Performs an unordered comparison of two 32-bit float values using
   1165 ///    the low-order bits of both operands to determine if the first operand
   1166 ///    is less than or equal to the second operand and returns the result of
   1167 ///    the comparison.
   1168 ///
   1169 /// \headerfile <x86intrin.h>
   1170 ///
   1171 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
   1172 ///
   1173 /// \param __a
   1174 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1175 ///    used in the comparison.
   1176 /// \param __b
   1177 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1178 ///    used in the comparison.
   1179 /// \returns An integer containing the comparison results.
   1180 static __inline__ int __DEFAULT_FN_ATTRS
   1181 _mm_ucomile_ss(__m128 __a, __m128 __b)
   1182 {
   1183   return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
   1184 }
   1185 
   1186 /// \brief Performs an unordered comparison of two 32-bit float values using
   1187 ///    the low-order bits of both operands to determine if the first operand
   1188 ///    is greater than the second operand and returns the result of the
   1189 ///    comparison.
   1190 ///
   1191 /// \headerfile <x86intrin.h>
   1192 ///
   1193 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
   1194 ///
   1195 /// \param __a
   1196 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1197 ///    used in the comparison.
   1198 /// \param __b
   1199 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1200 ///    used in the comparison.
   1201 /// \returns An integer containing the comparison results.
   1202 static __inline__ int __DEFAULT_FN_ATTRS
   1203 _mm_ucomigt_ss(__m128 __a, __m128 __b)
   1204 {
   1205   return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
   1206 }
   1207 
   1208 /// \brief Performs an unordered comparison of two 32-bit float values using
   1209 ///    the low-order bits of both operands to determine if the first operand is
   1210 ///    greater than or equal to the second operand and returns the result of
   1211 ///    the comparison.
   1212 ///
   1213 /// \headerfile <x86intrin.h>
   1214 ///
   1215 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
   1216 ///
   1217 /// \param __a
   1218 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1219 ///    used in the comparison.
   1220 /// \param __b
   1221 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1222 ///    used in the comparison.
   1223 /// \returns An integer containing the comparison results.
   1224 static __inline__ int __DEFAULT_FN_ATTRS
   1225 _mm_ucomige_ss(__m128 __a, __m128 __b)
   1226 {
   1227   return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
   1228 }
   1229 
   1230 /// \brief Performs an unordered comparison of two 32-bit float values using
   1231 ///    the low-order bits of both operands to determine inequality and returns
   1232 ///    the result of the comparison.
   1233 ///
   1234 /// \headerfile <x86intrin.h>
   1235 ///
   1236 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
   1237 ///
   1238 /// \param __a
   1239 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1240 ///    used in the comparison.
   1241 /// \param __b
   1242 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1243 ///    used in the comparison.
   1244 /// \returns An integer containing the comparison results.
   1245 static __inline__ int __DEFAULT_FN_ATTRS
   1246 _mm_ucomineq_ss(__m128 __a, __m128 __b)
   1247 {
   1248   return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
   1249 }
   1250 
   1251 /// \brief Converts a float value contained in the lower 32 bits of a vector of
   1252 ///    [4 x float] into a 32-bit integer.
   1253 ///
   1254 /// \headerfile <x86intrin.h>
   1255 ///
   1256 /// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
   1257 ///
   1258 /// \param __a
   1259 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1260 ///    used in the conversion.
   1261 /// \returns A 32-bit integer containing the converted value.
   1262 static __inline__ int __DEFAULT_FN_ATTRS
   1263 _mm_cvtss_si32(__m128 __a)
   1264 {
   1265   return __builtin_ia32_cvtss2si((__v4sf)__a);
   1266 }
   1267 
   1268 /// \brief Converts a float value contained in the lower 32 bits of a vector of
   1269 ///    [4 x float] into a 32-bit integer.
   1270 ///
   1271 /// \headerfile <x86intrin.h>
   1272 ///
   1273 /// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
   1274 ///
   1275 /// \param __a
   1276 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1277 ///    used in the conversion.
   1278 /// \returns A 32-bit integer containing the converted value.
   1279 static __inline__ int __DEFAULT_FN_ATTRS
   1280 _mm_cvt_ss2si(__m128 __a)
   1281 {
   1282   return _mm_cvtss_si32(__a);
   1283 }
   1284 
   1285 #ifdef __x86_64__
   1286 
   1287 /// \brief Converts a float value contained in the lower 32 bits of a vector of
   1288 ///    [4 x float] into a 64-bit integer.
   1289 ///
   1290 /// \headerfile <x86intrin.h>
   1291 ///
   1292 /// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
   1293 ///
   1294 /// \param __a
   1295 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1296 ///    used in the conversion.
   1297 /// \returns A 64-bit integer containing the converted value.
   1298 static __inline__ long long __DEFAULT_FN_ATTRS
   1299 _mm_cvtss_si64(__m128 __a)
   1300 {
   1301   return __builtin_ia32_cvtss2si64((__v4sf)__a);
   1302 }
   1303 
   1304 #endif
   1305 
   1306 /// \brief Converts two low-order float values in a 128-bit vector of
   1307 ///    [4 x float] into a 64-bit vector of [2 x i32].
   1308 ///
   1309 /// \headerfile <x86intrin.h>
   1310 ///
   1311 /// This intrinsic corresponds to the \c CVTPS2PI instruction.
   1312 ///
   1313 /// \param __a
   1314 ///    A 128-bit vector of [4 x float].
   1315 /// \returns A 64-bit integer vector containing the converted values.
   1316 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1317 _mm_cvtps_pi32(__m128 __a)
   1318 {
   1319   return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
   1320 }
   1321 
   1322 /// \brief Converts two low-order float values in a 128-bit vector of
   1323 ///    [4 x float] into a 64-bit vector of [2 x i32].
   1324 ///
   1325 /// \headerfile <x86intrin.h>
   1326 ///
   1327 /// This intrinsic corresponds to the \c CVTPS2PI instruction.
   1328 ///
   1329 /// \param __a
   1330 ///    A 128-bit vector of [4 x float].
   1331 /// \returns A 64-bit integer vector containing the converted values.
   1332 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1333 _mm_cvt_ps2pi(__m128 __a)
   1334 {
   1335   return _mm_cvtps_pi32(__a);
   1336 }
   1337 
   1338 /// \brief Converts a float value contained in the lower 32 bits of a vector of
   1339 ///    [4 x float] into a 32-bit integer, truncating the result when it is
   1340 ///    inexact.
   1341 ///
   1342 /// \headerfile <x86intrin.h>
   1343 ///
   1344 /// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
   1345 ///
   1346 /// \param __a
   1347 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1348 ///    used in the conversion.
   1349 /// \returns A 32-bit integer containing the converted value.
   1350 static __inline__ int __DEFAULT_FN_ATTRS
   1351 _mm_cvttss_si32(__m128 __a)
   1352 {
   1353   return __a[0];
   1354 }
   1355 
   1356 /// \brief Converts a float value contained in the lower 32 bits of a vector of
   1357 ///    [4 x float] into a 32-bit integer, truncating the result when it is
   1358 ///    inexact.
   1359 ///
   1360 /// \headerfile <x86intrin.h>
   1361 ///
   1362 /// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
   1363 ///
   1364 /// \param __a
   1365 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1366 ///    used in the conversion.
   1367 /// \returns A 32-bit integer containing the converted value.
   1368 static __inline__ int __DEFAULT_FN_ATTRS
   1369 _mm_cvtt_ss2si(__m128 __a)
   1370 {
   1371   return _mm_cvttss_si32(__a);
   1372 }
   1373 
   1374 /// \brief Converts a float value contained in the lower 32 bits of a vector of
   1375 ///    [4 x float] into a 64-bit integer, truncating the result when it is
   1376 ///    inexact.
   1377 ///
   1378 /// \headerfile <x86intrin.h>
   1379 ///
   1380 /// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
   1381 ///
   1382 /// \param __a
   1383 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1384 ///    used in the conversion.
   1385 /// \returns A 64-bit integer containing the converted value.
   1386 static __inline__ long long __DEFAULT_FN_ATTRS
   1387 _mm_cvttss_si64(__m128 __a)
   1388 {
   1389   return __a[0];
   1390 }
   1391 
   1392 /// \brief Converts two low-order float values in a 128-bit vector of
   1393 ///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
   1394 ///    when it is inexact.
   1395 ///
   1396 /// \headerfile <x86intrin.h>
   1397 ///
   1398 /// This intrinsic corresponds to the \c CVTTPS2PI / VTTPS2PI instructions.
   1399 ///
   1400 /// \param __a
   1401 ///    A 128-bit vector of [4 x float].
   1402 /// \returns A 64-bit integer vector containing the converted values.
   1403 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1404 _mm_cvttps_pi32(__m128 __a)
   1405 {
   1406   return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
   1407 }
   1408 
   1409 /// \brief Converts two low-order float values in a 128-bit vector of [4 x
   1410 ///    float] into a 64-bit vector of [2 x i32], truncating the result when it
   1411 ///    is inexact.
   1412 ///
   1413 /// \headerfile <x86intrin.h>
   1414 ///
   1415 /// This intrinsic corresponds to the \c CVTTPS2PI instruction.
   1416 ///
   1417 /// \param __a
   1418 ///    A 128-bit vector of [4 x float].
   1419 /// \returns A 64-bit integer vector containing the converted values.
   1420 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1421 _mm_cvtt_ps2pi(__m128 __a)
   1422 {
   1423   return _mm_cvttps_pi32(__a);
   1424 }
   1425 
   1426 /// \brief Converts a 32-bit signed integer value into a floating point value
   1427 ///    and writes it to the lower 32 bits of the destination. The remaining
   1428 ///    higher order elements of the destination vector are copied from the
   1429 ///    corresponding elements in the first operand.
   1430 ///
   1431 /// \headerfile <x86intrin.h>
   1432 ///
   1433 /// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
   1434 ///
   1435 /// \param __a
   1436 ///    A 128-bit vector of [4 x float].
   1437 /// \param __b
   1438 ///    A 32-bit signed integer operand containing the value to be converted.
   1439 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
   1440 ///    converted value of the second operand. The upper 96 bits are copied from
   1441 ///    the upper 96 bits of the first operand.
   1442 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1443 _mm_cvtsi32_ss(__m128 __a, int __b)
   1444 {
   1445   __a[0] = __b;
   1446   return __a;
   1447 }
   1448 
   1449 /// \brief Converts a 32-bit signed integer value into a floating point value
   1450 ///    and writes it to the lower 32 bits of the destination. The remaining
   1451 ///    higher order elements of the destination are copied from the
   1452 ///    corresponding elements in the first operand.
   1453 ///
   1454 /// \headerfile <x86intrin.h>
   1455 ///
   1456 /// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
   1457 ///
   1458 /// \param __a
   1459 ///    A 128-bit vector of [4 x float].
   1460 /// \param __b
   1461 ///    A 32-bit signed integer operand containing the value to be converted.
   1462 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
   1463 ///    converted value of the second operand. The upper 96 bits are copied from
   1464 ///    the upper 96 bits of the first operand.
   1465 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1466 _mm_cvt_si2ss(__m128 __a, int __b)
   1467 {
   1468   return _mm_cvtsi32_ss(__a, __b);
   1469 }
   1470 
   1471 #ifdef __x86_64__
   1472 
   1473 /// \brief Converts a 64-bit signed integer value into a floating point value
   1474 ///    and writes it to the lower 32 bits of the destination. The remaining
   1475 ///    higher order elements of the destination are copied from the
   1476 ///    corresponding elements in the first operand.
   1477 ///
   1478 /// \headerfile <x86intrin.h>
   1479 ///
   1480 /// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
   1481 ///
   1482 /// \param __a
   1483 ///    A 128-bit vector of [4 x float].
   1484 /// \param __b
   1485 ///    A 64-bit signed integer operand containing the value to be converted.
   1486 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
   1487 ///    converted value of the second operand. The upper 96 bits are copied from
   1488 ///    the upper 96 bits of the first operand.
   1489 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1490 _mm_cvtsi64_ss(__m128 __a, long long __b)
   1491 {
   1492   __a[0] = __b;
   1493   return __a;
   1494 }
   1495 
   1496 #endif
   1497 
   1498 /// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
   1499 ///    floating point values and writes them to the lower 64-bits of the
   1500 ///    destination. The remaining higher order elements of the destination are
   1501 ///    copied from the corresponding elements in the first operand.
   1502 ///
   1503 /// \headerfile <x86intrin.h>
   1504 ///
   1505 /// This intrinsic corresponds to the \c CVTPI2PS instruction.
   1506 ///
   1507 /// \param __a
   1508 ///    A 128-bit vector of [4 x float].
   1509 /// \param __b
   1510 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
   1511 ///    and written to the corresponding low-order elements in the destination.
   1512 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
   1513 ///    converted value of the second operand. The upper 64 bits are copied from
   1514 ///    the upper 64 bits of the first operand.
   1515 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1516 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
   1517 {
   1518   return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
   1519 }
   1520 
   1521 /// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
   1522 ///    floating point values and writes them to the lower 64-bits of the
   1523 ///    destination. The remaining higher order elements of the destination are
   1524 ///    copied from the corresponding elements in the first operand.
   1525 ///
   1526 /// \headerfile <x86intrin.h>
   1527 ///
   1528 /// This intrinsic corresponds to the \c CVTPI2PS instruction.
   1529 ///
   1530 /// \param __a
   1531 ///    A 128-bit vector of [4 x float].
   1532 /// \param __b
   1533 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
   1534 ///    and written to the corresponding low-order elements in the destination.
   1535 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
   1536 ///    converted value from the second operand. The upper 64 bits are copied
   1537 ///    from the upper 64 bits of the first operand.
   1538 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1539 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
   1540 {
   1541   return _mm_cvtpi32_ps(__a, __b);
   1542 }
   1543 
   1544 /// \brief Extracts a float value contained in the lower 32 bits of a vector of
   1545 ///    [4 x float].
   1546 ///
   1547 /// \headerfile <x86intrin.h>
   1548 ///
   1549 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
   1550 ///
   1551 /// \param __a
   1552 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1553 ///    used in the extraction.
   1554 /// \returns A 32-bit float containing the extracted value.
   1555 static __inline__ float __DEFAULT_FN_ATTRS
   1556 _mm_cvtss_f32(__m128 __a)
   1557 {
   1558   return __a[0];
   1559 }
   1560 
/// \brief Loads two packed float values from the address __p into the
///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
///     are copied from the low-order bits of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVHPS / MOVHPS instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
///    of the destination.
/// \param __p
///    A pointer to two packed float values. Bits [63:0] are written to bits
///    [127:64] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the moved values.
   1576 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1577 _mm_loadh_pi(__m128 __a, const __m64 *__p)
   1578 {
   1579   typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
   1580   struct __mm_loadh_pi_struct {
   1581     __mm_loadh_pi_v2f32 __u;
   1582   } __attribute__((__packed__, __may_alias__));
   1583   __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
   1584   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
   1585   return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
   1586 }
   1587 
/// \brief Loads two packed float values from the address __p into the low-order
///    bits of a 128-bit vector of [4 x float]. The high-order bits are copied
///    from the high-order bits of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVLPS / MOVLPS instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
///    [127:64] of the destination.
/// \param __p
///    A pointer to two packed float values. Bits [63:0] are written to bits
///    [63:0] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the moved values.
   1603 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1604 _mm_loadl_pi(__m128 __a, const __m64 *__p)
   1605 {
   1606   typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
   1607   struct __mm_loadl_pi_struct {
   1608     __mm_loadl_pi_v2f32 __u;
   1609   } __attribute__((__packed__, __may_alias__));
   1610   __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
   1611   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
   1612   return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
   1613 }
   1614 
   1615 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
   1616 ///    32 bits of the vector are initialized with the single-precision
   1617 ///    floating-point value loaded from a specified memory location. The upper
   1618 ///    96 bits are set to zero.
   1619 ///
   1620 /// \headerfile <x86intrin.h>
   1621 ///
   1622 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
   1623 ///
   1624 /// \param __p
   1625 ///    A pointer to a 32-bit memory location containing a single-precision
   1626 ///    floating-point value.
   1627 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
   1628 ///    lower 32 bits contain the value loaded from the memory location. The
   1629 ///    upper 96 bits are set to zero.
   1630 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1631 _mm_load_ss(const float *__p)
   1632 {
   1633   struct __mm_load_ss_struct {
   1634     float __u;
   1635   } __attribute__((__packed__, __may_alias__));
   1636   float __u = ((struct __mm_load_ss_struct*)__p)->__u;
   1637   return (__m128){ __u, 0, 0, 0 };
   1638 }
   1639 
   1640 /// \brief Loads a 32-bit float value and duplicates it to all four vector
   1641 ///    elements of a 128-bit vector of [4 x float].
   1642 ///
   1643 /// \headerfile <x86intrin.h>
   1644 ///
   1645 /// This intrinsic corresponds to the \c VMOVSS / MOVSS + \c shuffling
   1646 ///    instruction.
   1647 ///
   1648 /// \param __p
   1649 ///    A pointer to a float value to be loaded and duplicated.
   1650 /// \returns A 128-bit vector of [4 x float] containing the loaded
   1651 ///    and duplicated values.
   1652 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1653 _mm_load1_ps(const float *__p)
   1654 {
   1655   struct __mm_load1_ps_struct {
   1656     float __u;
   1657   } __attribute__((__packed__, __may_alias__));
   1658   float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
   1659   return (__m128){ __u, __u, __u, __u };
   1660 }
   1661 
/* Alternate spelling of _mm_load1_ps: expands to a call to _mm_load1_ps with
   the same argument. */
#define        _mm_load_ps1(p) _mm_load1_ps(p)
   1663 
/// \brief Loads a 128-bit floating-point vector of [4 x float] from an aligned
///    memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location has to be 128-bit aligned.
/// \returns A 128-bit vector of [4 x float] containing the loaded values.
   1675 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1676 _mm_load_ps(const float *__p)
   1677 {
   1678   return *(__m128*)__p;
   1679 }
   1680 
   1681 /// \brief Loads a 128-bit floating-point vector of [4 x float] from an
   1682 ///    unaligned memory location.
   1683 ///
   1684 /// \headerfile <x86intrin.h>
   1685 ///
   1686 /// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
   1687 ///
   1688 /// \param __p
   1689 ///    A pointer to a 128-bit memory location. The address of the memory
   1690 ///    location does not have to be aligned.
   1691 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
   1692 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1693 _mm_loadu_ps(const float *__p)
   1694 {
   1695   struct __loadu_ps {
   1696     __m128 __v;
   1697   } __attribute__((__packed__, __may_alias__));
   1698   return ((struct __loadu_ps*)__p)->__v;
   1699 }
   1700 
   1701 /// \brief Loads four packed float values, in reverse order, from an aligned
   1702 ///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
   1703 ///
   1704 /// \headerfile <x86intrin.h>
   1705 ///
   1706 /// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
   1707 ///    instruction.
   1708 ///
   1709 /// \param __p
   1710 ///    A pointer to a 128-bit memory location. The address of the memory
   1711 ///    location has to be 128-bit aligned.
   1712 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
   1713 ///    in reverse order.
   1714 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1715 _mm_loadr_ps(const float *__p)
   1716 {
   1717   __m128 __a = _mm_load_ps(__p);
   1718   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
   1719 }
   1720 
   1721 /// \brief Create a 128-bit vector of [4 x float] with undefined values.
   1722 ///
   1723 /// \headerfile <x86intrin.h>
   1724 ///
   1725 /// This intrinsic has no corresponding instruction.
   1726 ///
   1727 /// \returns A 128-bit vector of [4 x float] containing undefined values.
   1728 
   1729 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1730 _mm_undefined_ps(void)
   1731 {
   1732   return (__m128)__builtin_ia32_undef128();
   1733 }
   1734 
   1735 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
   1736 ///    32 bits of the vector are initialized with the specified single-precision
   1737 ///    floating-point value. The upper 96 bits are set to zero.
   1738 ///
   1739 /// \headerfile <x86intrin.h>
   1740 ///
   1741 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
   1742 ///
   1743 /// \param __w
   1744 ///    A single-precision floating-point value used to initialize the lower 32
   1745 ///    bits of the result.
   1746 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
   1747 ///    lower 32 bits contain the value provided in the source operand. The
   1748 ///    upper 96 bits are set to zero.
   1749 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1750 _mm_set_ss(float __w)
   1751 {
   1752   return (__m128){ __w, 0, 0, 0 };
   1753 }
   1754 
   1755 /// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
   1756 ///    of the four single-precision floating-point vector elements set to the
   1757 ///    specified single-precision floating-point value.
   1758 ///
   1759 /// \headerfile <x86intrin.h>
   1760 ///
   1761 /// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
   1762 ///
   1763 /// \param __w
   1764 ///    A single-precision floating-point value used to initialize each vector
   1765 ///    element of the result.
   1766 /// \returns An initialized 128-bit floating-point vector of [4 x float].
   1767 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1768 _mm_set1_ps(float __w)
   1769 {
   1770   return (__m128){ __w, __w, __w, __w };
   1771 }
   1772 
   1773 /* Microsoft specific. */
   1774 /// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
   1775 ///    of the four single-precision floating-point vector elements set to the
   1776 ///    specified single-precision floating-point value.
   1777 ///
   1778 /// \headerfile <x86intrin.h>
   1779 ///
   1780 /// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
   1781 ///
   1782 /// \param __w
   1783 ///    A single-precision floating-point value used to initialize each vector
   1784 ///    element of the result.
   1785 /// \returns An initialized 128-bit floating-point vector of [4 x float].
   1786 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1787 _mm_set_ps1(float __w)
   1788 {
   1789     return _mm_set1_ps(__w);
   1790 }
   1791 
   1792 /// \brief Constructs a 128-bit floating-point vector of [4 x float]
   1793 ///    initialized with the specified single-precision floating-point values.
   1794 ///
   1795 /// \headerfile <x86intrin.h>
   1796 ///
   1797 /// This intrinsic is a utility function and does not correspond to a specific
   1798 ///    instruction.
   1799 ///
   1800 /// \param __z
   1801 ///    A single-precision floating-point value used to initialize bits [127:96]
   1802 ///    of the result.
   1803 /// \param __y
   1804 ///    A single-precision floating-point value used to initialize bits [95:64]
   1805 ///    of the result.
   1806 /// \param __x
   1807 ///    A single-precision floating-point value used to initialize bits [63:32]
   1808 ///    of the result.
   1809 /// \param __w
   1810 ///    A single-precision floating-point value used to initialize bits [31:0]
   1811 ///    of the result.
   1812 /// \returns An initialized 128-bit floating-point vector of [4 x float].
   1813 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1814 _mm_set_ps(float __z, float __y, float __x, float __w)
   1815 {
   1816   return (__m128){ __w, __x, __y, __z };
   1817 }
   1818 
   1819 /// \brief Constructs a 128-bit floating-point vector of [4 x float],
   1820 ///    initialized in reverse order with the specified 32-bit single-precision
   1821 ///    float-point values.
   1822 ///
   1823 /// \headerfile <x86intrin.h>
   1824 ///
   1825 /// This intrinsic is a utility function and does not correspond to a specific
   1826 ///    instruction.
   1827 ///
   1828 /// \param __z
   1829 ///    A single-precision floating-point value used to initialize bits [31:0]
   1830 ///    of the result.
   1831 /// \param __y
   1832 ///    A single-precision floating-point value used to initialize bits [63:32]
   1833 ///    of the result.
   1834 /// \param __x
   1835 ///    A single-precision floating-point value used to initialize bits [95:64]
   1836 ///    of the result.
   1837 /// \param __w
   1838 ///    A single-precision floating-point value used to initialize bits [127:96]
   1839 ///    of the result.
   1840 /// \returns An initialized 128-bit floating-point vector of [4 x float].
   1841 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1842 _mm_setr_ps(float __z, float __y, float __x, float __w)
   1843 {
   1844   return (__m128){ __z, __y, __x, __w };
   1845 }
   1846 
   1847 /// \brief Constructs a 128-bit floating-point vector of [4 x float] initialized
   1848 ///    to zero.
   1849 ///
   1850 /// \headerfile <x86intrin.h>
   1851 ///
   1852 /// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
   1853 ///
   1854 /// \returns An initialized 128-bit floating-point vector of [4 x float] with
   1855 ///    all elements set to zero.
   1856 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1857 _mm_setzero_ps(void)
   1858 {
   1859   return (__m128){ 0, 0, 0, 0 };
   1860 }
   1861 
/// \brief Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
///    memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVHPS / MOVHPS instruction.
///
/// \param __p
///    A pointer to a 64-bit memory location.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be stored.
   1873 static __inline__ void __DEFAULT_FN_ATTRS
   1874 _mm_storeh_pi(__m64 *__p, __m128 __a)
   1875 {
   1876   __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a);
   1877 }
   1878 
   1879 /// \brief Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
   1880 ///     memory location.
   1881 ///
   1882 /// \headerfile <x86intrin.h>
   1883 ///
   1884 /// This intrinsic corresponds to the \c VMOVLPS / MOVLPS instruction.
   1885 ///
   1886 /// \param __p
   1887 ///    A pointer to a memory location that will receive the float values.
   1888 /// \param __a
   1889 ///    A 128-bit vector of [4 x float] containing the values to be stored.
   1890 static __inline__ void __DEFAULT_FN_ATTRS
   1891 _mm_storel_pi(__m64 *__p, __m128 __a)
   1892 {
   1893   __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a);
   1894 }
   1895 
   1896 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
   1897 ///     memory location.
   1898 ///
   1899 /// \headerfile <x86intrin.h>
   1900 ///
   1901 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
   1902 ///
   1903 /// \param __p
   1904 ///    A pointer to a 32-bit memory location.
   1905 /// \param __a
   1906 ///    A 128-bit vector of [4 x float] containing the value to be stored.
   1907 static __inline__ void __DEFAULT_FN_ATTRS
   1908 _mm_store_ss(float *__p, __m128 __a)
   1909 {
   1910   struct __mm_store_ss_struct {
   1911     float __u;
   1912   } __attribute__((__packed__, __may_alias__));
   1913   ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
   1914 }
   1915 
   1916 /// \brief Stores float values from a 128-bit vector of [4 x float] to an
   1917 ///    unaligned memory location.
   1918 ///
   1919 /// \headerfile <x86intrin.h>
   1920 ///
   1921 /// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
   1922 ///
   1923 /// \param __p
   1924 ///    A pointer to a 128-bit memory location. The address of the memory
   1925 ///    location does not have to be aligned.
   1926 /// \param __a
   1927 ///    A 128-bit vector of [4 x float] containing the values to be stored.
   1928 static __inline__ void __DEFAULT_FN_ATTRS
   1929 _mm_storeu_ps(float *__p, __m128 __a)
   1930 {
   1931   struct __storeu_ps {
   1932     __m128 __v;
   1933   } __attribute__((__packed__, __may_alias__));
   1934   ((struct __storeu_ps*)__p)->__v = __a;
   1935 }
   1936 
/// \brief Stores a 128-bit vector of [4 x float] into an aligned memory
///    location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location has to be 128-bit aligned.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be stored.
   1950 static __inline__ void __DEFAULT_FN_ATTRS
   1951 _mm_store_ps(float *__p, __m128 __a)
   1952 {
   1953   *(__m128*)__p = __a;
   1954 }
   1955 
   1956 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
   1957 ///    four contiguous elements in an aligned memory location.
   1958 ///
   1959 /// \headerfile <x86intrin.h>
   1960 ///
   1961 /// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling
   1962 ///    instruction.
   1963 ///
   1964 /// \param __p
   1965 ///    A pointer to a 128-bit memory location.
   1966 /// \param __a
   1967 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
   1968 ///    of the four contiguous elements pointed by __p.
   1969 static __inline__ void __DEFAULT_FN_ATTRS
   1970 _mm_store1_ps(float *__p, __m128 __a)
   1971 {
   1972   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
   1973   _mm_store_ps(__p, __a);
   1974 }
   1975 
/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
///    four contiguous elements in an aligned memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling
///    instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location has to be 128-bit aligned.
/// \param __a
///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
///    of the four contiguous elements pointed by __p.
   1988 static __inline__ void __DEFAULT_FN_ATTRS
   1989 _mm_store_ps1(float *__p, __m128 __a)
   1990 {
   1991   return _mm_store1_ps(__p, __a);
   1992 }
   1993 
   1994 /// \brief Stores float values from a 128-bit vector of [4 x float] to an
   1995 ///    aligned memory location in reverse order.
   1996 ///
   1997 /// \headerfile <x86intrin.h>
   1998 ///
   1999 /// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
   2000 ///    instruction.
   2001 ///
   2002 /// \param __p
   2003 ///    A pointer to a 128-bit memory location. The address of the memory
   2004 ///    location has to be 128-bit aligned.
   2005 /// \param __a
   2006 ///    A 128-bit vector of [4 x float] containing the values to be stored.
   2007 static __inline__ void __DEFAULT_FN_ATTRS
   2008 _mm_storer_ps(float *__p, __m128 __a)
   2009 {
   2010   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
   2011   _mm_store_ps(__p, __a);
   2012 }
   2013 
   2014 #define _MM_HINT_T0 3
   2015 #define _MM_HINT_T1 2
   2016 #define _MM_HINT_T2 1
   2017 #define _MM_HINT_NTA 0
   2018 
   2019 #ifndef _MSC_VER
   2020 /* FIXME: We have to #define this because "sel" must be a constant integer, and
   2021    Sema doesn't do any form of constant propagation yet. */
   2022 
   2023 /// \brief Loads one cache line of data from the specified address to a location
   2024 ///    closer to the processor.
   2025 ///
   2026 /// \headerfile <x86intrin.h>
   2027 ///
   2028 /// \code
   2029 /// void _mm_prefetch(const void * a, const int sel);
   2030 /// \endcode
   2031 ///
   2032 /// This intrinsic corresponds to the \c PREFETCHNTA instruction.
   2033 ///
   2034 /// \param a
   2035 ///    A pointer to a memory location containing a cache line of data.
   2036 /// \param sel
   2037 ///    A predefined integer constant specifying the type of prefetch operation:
   2038 ///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint.
   2039 ///    The PREFETCHNTA instruction will be generated.
   2040 ///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
   2041 ///    be generated.
   2042 ///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
   2043 ///    be generated.
   2044 ///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
   2045 ///    be generated.
   2046 #define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
   2047 #endif
   2048 
   2049 /// \brief Stores a 64-bit integer in the specified aligned memory location. To
   2050 ///    minimize caching, the data is flagged as non-temporal (unlikely to be
   2051 ///    used again soon).
   2052 ///
   2053 /// \headerfile <x86intrin.h>
   2054 ///
   2055 /// This intrinsic corresponds to the \c MOVNTQ instruction.
   2056 ///
   2057 /// \param __p
   2058 ///    A pointer to an aligned memory location used to store the register value.
   2059 /// \param __a
   2060 ///    A 64-bit integer containing the value to be stored.
   2061 static __inline__ void __DEFAULT_FN_ATTRS
   2062 _mm_stream_pi(__m64 *__p, __m64 __a)
   2063 {
   2064   __builtin_ia32_movntq(__p, __a);
   2065 }
   2066 
/// \brief Moves packed float values from a 128-bit vector of [4 x float] to a
///    128-bit aligned memory location. To minimize caching, the data is flagged
///    as non-temporal (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction.
///
/// \param __p
///    A pointer to a 128-bit aligned memory location that will receive the
///    single-precision floating-point values.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be moved.
   2080 static __inline__ void __DEFAULT_FN_ATTRS
   2081 _mm_stream_ps(float *__p, __m128 __a)
   2082 {
   2083   __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
   2084 }
   2085 
   2086 /// \brief Forces strong memory ordering (serialization) between store
   2087 ///    instructions preceding this instruction and store instructions following
   2088 ///    this instruction, ensuring the system completes all previous stores
   2089 ///    before executing subsequent stores.
   2090 ///
   2091 /// \headerfile <x86intrin.h>
   2092 ///
   2093 /// This intrinsic corresponds to the \c SFENCE instruction.
   2094 ///
static __inline__ void __DEFAULT_FN_ATTRS
_mm_sfence(void)
{
  /* The whole operation is delegated to the compiler's sfence builtin. */
  __builtin_ia32_sfence();
}
   2100 
   2101 /// \brief Extracts 16-bit element from a 64-bit vector of [4 x i16] and
   2102 ///    returns it, as specified by the immediate integer operand.
   2103 ///
   2104 /// \headerfile <x86intrin.h>
   2105 ///
   2106 /// This intrinsic corresponds to the \c VPEXTRW / PEXTRW instruction.
   2107 ///
   2108 /// \param __a
   2109 ///    A 64-bit vector of [4 x i16].
   2110 /// \param __n
   2111 ///    An immediate integer operand that determines which bits are extracted:
   2112 ///    0: Bits [15:0] are copied to the destination.
   2113 ///    1: Bits [31:16] are copied to the destination.
   2114 ///    2: Bits [47:32] are copied to the destination.
   2115 ///    3: Bits [63:48] are copied to the destination.
   2116 /// \returns A 16-bit integer containing the extracted 16 bits of packed data.
   2117 #define _mm_extract_pi16(a, n) __extension__ ({ \
   2118   (int)__builtin_ia32_vec_ext_v4hi((__m64)a, (int)n); })
   2119 
/// \brief Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand __n.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPINSRW / PINSRW instruction.
///
/// \param __a
///    A 64-bit vector of [4 x i16].
/// \param __d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand __n.
/// \param __n
///    An immediate integer operand that determines which bits of the
///    destination receive the value from operand __d.
///    0: Bits [15:0] are copied to the destination.
///    1: Bits [31:16] are copied to the destination.
///    2: Bits [47:32] are copied to the destination.
///    3: Bits [63:48] are copied to the destination.
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand __a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
   2144 #define _mm_insert_pi16(a, d, n) __extension__ ({ \
   2145   (__m64)__builtin_ia32_vec_set_v4hi((__m64)a, (int)d, (int)n); })
   2146 
   2147 /// \brief Compares each of the corresponding packed 16-bit integer values of
   2148 ///    the 64-bit integer vectors, and writes the greater value to the
   2149 ///    corresponding bits in the destination.
   2150 ///
   2151 /// \headerfile <x86intrin.h>
   2152 ///
   2153 /// This intrinsic corresponds to the \c PMAXSW instruction.
   2154 ///
   2155 /// \param __a
   2156 ///    A 64-bit integer vector containing one of the source operands.
   2157 /// \param __b
   2158 ///    A 64-bit integer vector containing one of the source operands.
   2159 /// \returns A 64-bit integer vector containing the comparison results.
   2160 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2161 _mm_max_pi16(__m64 __a, __m64 __b)
   2162 {
   2163   return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
   2164 }
   2165 
   2166 /// \brief Compares each of the corresponding packed 8-bit unsigned integer
   2167 ///    values of the 64-bit integer vectors, and writes the greater value to the
   2168 ///    corresponding bits in the destination.
   2169 ///
   2170 /// \headerfile <x86intrin.h>
   2171 ///
   2172 /// This intrinsic corresponds to the \c PMAXUB instruction.
   2173 ///
   2174 /// \param __a
   2175 ///    A 64-bit integer vector containing one of the source operands.
   2176 /// \param __b
   2177 ///    A 64-bit integer vector containing one of the source operands.
   2178 /// \returns A 64-bit integer vector containing the comparison results.
   2179 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2180 _mm_max_pu8(__m64 __a, __m64 __b)
   2181 {
   2182   return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
   2183 }
   2184 
   2185 /// \brief Compares each of the corresponding packed 16-bit integer values of
   2186 ///    the 64-bit integer vectors, and writes the lesser value to the
   2187 ///    corresponding bits in the destination.
   2188 ///
   2189 /// \headerfile <x86intrin.h>
   2190 ///
   2191 /// This intrinsic corresponds to the \c PMINSW instruction.
   2192 ///
   2193 /// \param __a
   2194 ///    A 64-bit integer vector containing one of the source operands.
   2195 /// \param __b
   2196 ///    A 64-bit integer vector containing one of the source operands.
   2197 /// \returns A 64-bit integer vector containing the comparison results.
   2198 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2199 _mm_min_pi16(__m64 __a, __m64 __b)
   2200 {
   2201   return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
   2202 }
   2203 
   2204 /// \brief Compares each of the corresponding packed 8-bit unsigned integer
   2205 ///    values of the 64-bit integer vectors, and writes the lesser value to the
   2206 ///    corresponding bits in the destination.
   2207 ///
   2208 /// \headerfile <x86intrin.h>
   2209 ///
   2210 /// This intrinsic corresponds to the \c PMINUB instruction.
   2211 ///
   2212 /// \param __a
   2213 ///    A 64-bit integer vector containing one of the source operands.
   2214 /// \param __b
   2215 ///    A 64-bit integer vector containing one of the source operands.
   2216 /// \returns A 64-bit integer vector containing the comparison results.
   2217 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2218 _mm_min_pu8(__m64 __a, __m64 __b)
   2219 {
   2220   return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
   2221 }
   2222 
   2223 /// \brief Takes the most significant bit from each 8-bit element in a 64-bit
   2224 ///    integer vector to create a 16-bit mask value. Zero-extends the value to
   2225 ///    32-bit integer and writes it to the destination.
   2226 ///
   2227 /// \headerfile <x86intrin.h>
   2228 ///
   2229 /// This intrinsic corresponds to the \c PMOVMSKB instruction.
   2230 ///
   2231 /// \param __a
   2232 ///    A 64-bit integer vector containing the values with bits to be extracted.
   2233 /// \returns The most significant bit from each 8-bit element in the operand,
   2234 ///    written to bits [15:0].
   2235 static __inline__ int __DEFAULT_FN_ATTRS
   2236 _mm_movemask_pi8(__m64 __a)
   2237 {
   2238   return __builtin_ia32_pmovmskb((__v8qi)__a);
   2239 }
   2240 
   2241 /// \brief Multiplies packed 16-bit unsigned integer values and writes the
   2242 ///    high-order 16 bits of each 32-bit product to the corresponding bits in
   2243 ///    the destination.
   2244 ///
   2245 /// \headerfile <x86intrin.h>
   2246 ///
   2247 /// This intrinsic corresponds to the \c PMULHUW instruction.
   2248 ///
   2249 /// \param __a
   2250 ///    A 64-bit integer vector containing one of the source operands.
   2251 /// \param __b
   2252 ///    A 64-bit integer vector containing one of the source operands.
   2253 /// \returns A 64-bit integer vector containing the products of both operands.
   2254 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2255 _mm_mulhi_pu16(__m64 __a, __m64 __b)
   2256 {
   2257   return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
   2258 }
   2259 
   2260 /// \brief Shuffles the 4 16-bit integers from a 64-bit integer vector to the
   2261 ///    destination, as specified by the immediate value operand.
   2262 ///
   2263 /// \headerfile <x86intrin.h>
   2264 ///
   2265 /// This intrinsic corresponds to the \c PSHUFW instruction.
   2266 ///
   2267 /// \code
   2268 /// __m64 _mm_shuffle_pi16(__m64 a, const int n);
   2269 /// \endcode
   2270 ///
   2271 /// \param a
   2272 ///    A 64-bit integer vector containing the values to be shuffled.
   2273 /// \param n
   2274 ///    An immediate value containing an 8-bit value specifying which elements to
   2275 ///    copy from a. The destinations within the 64-bit destination are assigned
   2276 ///    values as follows:
   2277 ///    Bits [1:0] are used to assign values to bits [15:0] in the destination.
   2278 ///    Bits [3:2] are used to assign values to bits [31:16] in the destination.
   2279 ///    Bits [5:4] are used to assign values to bits [47:32] in the destination.
   2280 ///    Bits [7:6] are used to assign values to bits [63:48] in the destination.
   2281 ///    Bit value assignments:
   2282 ///    00: assigned from bits [15:0] of a.
   2283 ///    01: assigned from bits [31:16] of a.
   2284 ///    10: assigned from bits [47:32] of a.
   2285 ///    11: assigned from bits [63:48] of a.
   2286 /// \returns A 64-bit integer vector containing the shuffled values.
   2287 #define _mm_shuffle_pi16(a, n) __extension__ ({ \
   2288   (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); })
   2289 
   2290 /// \brief Conditionally copies the values from each 8-bit element in the first
   2291 ///    64-bit integer vector operand to the specified memory location, as
   2292 ///    specified by the most significant bit in the corresponding element in the
   2293 ///    second 64-bit integer vector operand. To minimize caching, the data is
   2294 ///    flagged as non-temporal (unlikely to be used again soon).
   2295 ///
   2296 /// \headerfile <x86intrin.h>
   2297 ///
   2298 /// This intrinsic corresponds to the \c MASKMOVQ instruction.
   2299 ///
   2300 /// \param __d
   2301 ///    A 64-bit integer vector containing the values with elements to be copied.
   2302 /// \param __n
   2303 ///    A 64-bit integer vector operand. The most significant bit from each 8-bit
   2304 ///    element determines whether the corresponding element in operand __d is
   2305 ///    copied. If the most significant bit of a given element is 1, the
   2306 ///    corresponding element in operand __d is copied.
   2307 /// \param __p
   2308 ///    A pointer to a 64-bit memory location that will receive the conditionally
   2309 ///    copied integer values. The address of the memory location does not have
   2310 ///    to be aligned.
   2311 static __inline__ void __DEFAULT_FN_ATTRS
   2312 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
   2313 {
   2314   __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
   2315 }
   2316 
   2317 /// \brief Computes the rounded averages of the packed unsigned 8-bit integer
   2318 ///    values and writes the averages to the corresponding bits in the
   2319 ///    destination.
   2320 ///
   2321 /// \headerfile <x86intrin.h>
   2322 ///
   2323 /// This intrinsic corresponds to the \c PAVGB instruction.
   2324 ///
   2325 /// \param __a
   2326 ///    A 64-bit integer vector containing one of the source operands.
   2327 /// \param __b
   2328 ///    A 64-bit integer vector containing one of the source operands.
   2329 /// \returns A 64-bit integer vector containing the averages of both operands.
   2330 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2331 _mm_avg_pu8(__m64 __a, __m64 __b)
   2332 {
   2333   return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
   2334 }
   2335 
   2336 /// \brief Computes the rounded averages of the packed unsigned 16-bit integer
   2337 ///    values and writes the averages to the corresponding bits in the
   2338 ///    destination.
   2339 ///
   2340 /// \headerfile <x86intrin.h>
   2341 ///
   2342 /// This intrinsic corresponds to the \c PAVGW instruction.
   2343 ///
   2344 /// \param __a
   2345 ///    A 64-bit integer vector containing one of the source operands.
   2346 /// \param __b
   2347 ///    A 64-bit integer vector containing one of the source operands.
   2348 /// \returns A 64-bit integer vector containing the averages of both operands.
   2349 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2350 _mm_avg_pu16(__m64 __a, __m64 __b)
   2351 {
   2352   return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
   2353 }
   2354 
   2355 /// \brief Subtracts the corresponding 8-bit unsigned integer values of the two
   2356 ///    64-bit vector operands and computes the absolute value for each of the
   2357 ///    difference. Then sum of the 8 absolute differences is written to the
   2358 ///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
   2359 ///
   2360 /// \headerfile <x86intrin.h>
   2361 ///
   2362 /// This intrinsic corresponds to the \c PSADBW instruction.
   2363 ///
   2364 /// \param __a
   2365 ///    A 64-bit integer vector containing one of the source operands.
   2366 /// \param __b
   2367 ///    A 64-bit integer vector containing one of the source operands.
   2368 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
   2369 ///    sets of absolute differences between both operands. The upper bits are
   2370 ///    cleared.
   2371 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2372 _mm_sad_pu8(__m64 __a, __m64 __b)
   2373 {
   2374   return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
   2375 }
   2376 
   2377 /// \brief Returns the contents of the MXCSR register as a 32-bit unsigned
   2378 ///    integer value. There are several groups of macros associated with this
   2379 ///    intrinsic, including:
   2380 ///    * For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
   2381 ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
   2382 ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
   2383 ///      _MM_GET_EXCEPTION_STATE().
   2384 ///    * For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
   2385 ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
   2386 ///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
   2387 ///    * For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
   2388 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
   2389 ///      _MM_GET_ROUNDING_MODE(x) where x is one of these macros.
   2390 ///    * For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
   2391 ///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
   2392 ///    * For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
   2393 ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
   2394 ///      _MM_GET_DENORMALS_ZERO_MODE().
   2395 ///
   2396 ///    For example, the expression below checks if an overflow exception has
   2397 ///    occurred:
   2398 ///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
   2399 ///
   2400 ///    The following example gets the current rounding mode:
   2401 ///      _MM_GET_ROUNDING_MODE()
   2402 ///
   2403 /// \headerfile <x86intrin.h>
   2404 ///
   2405 /// This intrinsic corresponds to the \c VSTMXCSR / STMXCSR instruction.
   2406 ///
   2407 /// \returns A 32-bit unsigned integer containing the contents of the MXCSR
   2408 ///    register.
   2409 static __inline__ unsigned int __DEFAULT_FN_ATTRS
   2410 _mm_getcsr(void)
   2411 {
   2412   return __builtin_ia32_stmxcsr();
   2413 }
   2414 
   2415 /// \brief Sets the MXCSR register with the 32-bit unsigned integer value. There
   2416 ///    are several groups of macros associated with this intrinsic, including:
   2417 ///    * For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
   2418 ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
   2419 ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
   2420 ///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
   2421 ///    * For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
   2422 ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
   2423 ///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
   2424 ///      of these macros.
   2425 ///    * For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
   2426 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
   2427 ///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
   2428 ///    * For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
   2429 ///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
   2430 ///      one of these macros.
   2431 ///    * For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
   2432 ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
   2433 ///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
   2434 ///
   2435 ///    For example, the following expression causes subsequent floating-point
   2436 ///    operations to round up:
   2437 ///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
   2438 ///
   2439 ///    The following example sets the DAZ and FTZ flags:
   2440 ///      void setFlags() {
   2441 ///        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)
   2442 ///        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)
   2443 ///      }
   2444 ///
   2445 /// \headerfile <x86intrin.h>
   2446 ///
   2447 /// This intrinsic corresponds to the \c VLDMXCSR / LDMXCSR instruction.
   2448 ///
   2449 /// \param __i
   2450 ///    A 32-bit unsigned integer value to be written to the MXCSR register.
   2451 static __inline__ void __DEFAULT_FN_ATTRS
   2452 _mm_setcsr(unsigned int __i)
   2453 {
   2454   __builtin_ia32_ldmxcsr(__i);
   2455 }
   2456 
   2457 /// \brief Selects 4 float values from the 128-bit operands of [4 x float], as
   2458 ///    specified by the immediate value operand.
   2459 ///
   2460 /// \headerfile <x86intrin.h>
   2461 ///
   2462 /// \code
   2463 /// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
   2464 /// \endcode
   2465 ///
   2466 /// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction.
   2467 ///
   2468 /// \param a
   2469 ///    A 128-bit vector of [4 x float].
   2470 /// \param b
   2471 ///    A 128-bit vector of [4 x float].
   2472 /// \param mask
   2473 ///    An immediate value containing an 8-bit value specifying which elements to
   2474 ///    copy from a and b.
   2475 ///    Bits [3:0] specify the values copied from operand a.
   2476 ///    Bits [7:4] specify the values copied from operand b. The destinations
   2477 ///    within the 128-bit destination are assigned values as follows:
   2478 ///    Bits [1:0] are used to assign values to bits [31:0] in the destination.
   2479 ///    Bits [3:2] are used to assign values to bits [63:32] in the destination.
   2480 ///    Bits [5:4] are used to assign values to bits [95:64] in the destination.
   2481 ///    Bits [7:6] are used to assign values to bits [127:96] in the destination.
   2482 ///    Bit value assignments:
   2483 ///    00: Bits [31:0] copied from the specified operand.
   2484 ///    01: Bits [63:32] copied from the specified operand.
   2485 ///    10: Bits [95:64] copied from the specified operand.
   2486 ///    11: Bits [127:96] copied from the specified operand.
   2487 /// \returns A 128-bit vector of [4 x float] containing the shuffled values.
   2488 #define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
   2489   (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
   2490                                   0 + (((mask) >> 0) & 0x3), \
   2491                                   0 + (((mask) >> 2) & 0x3), \
   2492                                   4 + (((mask) >> 4) & 0x3), \
   2493                                   4 + (((mask) >> 6) & 0x3)); })
   2494 
   2495 /// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
   2496 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x
   2497 ///    float].
   2498 ///
   2499 /// \headerfile <x86intrin.h>
   2500 ///
   2501 /// This intrinsic corresponds to the \c VUNPCKHPS / UNPCKHPS instruction.
   2502 ///
   2503 /// \param __a
   2504 ///    A 128-bit vector of [4 x float].
   2505 ///    Bits [95:64] are written to bits [31:0] of the destination.
   2506 ///    Bits [127:96] are written to bits [95:64] of the destination.
   2507 /// \param __b
   2508 ///    A 128-bit vector of [4 x float].
   2509 ///    Bits [95:64] are written to bits [63:32] of the destination.
   2510 ///    Bits [127:96] are written to bits [127:96] of the destination.
   2511 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
   2512 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2513 _mm_unpackhi_ps(__m128 __a, __m128 __b)
   2514 {
   2515   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
   2516 }
   2517 
   2518 /// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
   2519 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x
   2520 ///    float].
   2521 ///
   2522 /// \headerfile <x86intrin.h>
   2523 ///
   2524 /// This intrinsic corresponds to the \c VUNPCKLPS / UNPCKLPS instruction.
   2525 ///
   2526 /// \param __a
   2527 ///    A 128-bit vector of [4 x float].
   2528 ///    Bits [31:0] are written to bits [31:0] of the destination.
   2529 ///    Bits [63:32] are written to bits [95:64] of the destination.
   2530 /// \param __b
   2531 ///    A 128-bit vector of [4 x float].
   2532 ///    Bits [31:0] are written to bits [63:32] of the destination.
   2533 ///    Bits [63:32] are written to bits [127:96] of the destination.
   2534 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
   2535 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2536 _mm_unpacklo_ps(__m128 __a, __m128 __b)
   2537 {
   2538   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
   2539 }
   2540 
   2541 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
   2542 ///    32 bits are set to the lower 32 bits of the second parameter. The upper
   2543 ///    96 bits are set to the upper 96 bits of the first parameter.
   2544 ///
   2545 /// \headerfile <x86intrin.h>
   2546 ///
   2547 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
   2548 ///
   2549 /// \param __a
   2550 ///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
   2551 ///    written to the upper 96 bits of the result.
   2552 /// \param __b
   2553 ///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
   2554 ///    written to the lower 32 bits of the result.
   2555 /// \returns A 128-bit floating-point vector of [4 x float].
   2556 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2557 _mm_move_ss(__m128 __a, __m128 __b)
   2558 {
   2559   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 4, 1, 2, 3);
   2560 }
   2561 
   2562 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
   2563 ///    64 bits are set to the upper 64 bits of the second parameter. The upper
   2564 ///    64 bits are set to the upper 64 bits of the first parameter.
   2565 ///
   2566 /// \headerfile <x86intrin.h>
   2567 ///
   2568 /// This intrinsic corresponds to the \c VUNPCKHPD / UNPCKHPD instruction.
   2569 ///
   2570 /// \param __a
   2571 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
   2572 ///    written to the upper 64 bits of the result.
   2573 /// \param __b
   2574 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
   2575 ///    written to the lower 64 bits of the result.
   2576 /// \returns A 128-bit floating-point vector of [4 x float].
   2577 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2578 _mm_movehl_ps(__m128 __a, __m128 __b)
   2579 {
   2580   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
   2581 }
   2582 
   2583 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
   2584 ///    64 bits are set to the lower 64 bits of the first parameter. The upper
   2585 ///    64 bits are set to the lower 64 bits of the second parameter.
   2586 ///
   2587 /// \headerfile <x86intrin.h>
   2588 ///
   2589 /// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction.
   2590 ///
   2591 /// \param __a
   2592 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
   2593 ///    written to the lower 64 bits of the result.
   2594 /// \param __b
   2595 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
   2596 ///    written to the upper 64 bits of the result.
   2597 /// \returns A 128-bit floating-point vector of [4 x float].
   2598 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2599 _mm_movelh_ps(__m128 __a, __m128 __b)
   2600 {
   2601   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
   2602 }
   2603 
   2604 /// \brief Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
   2605 ///    float].
   2606 ///
   2607 /// \headerfile <x86intrin.h>
   2608 ///
   2609 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
   2610 ///
   2611 /// \param __a
   2612 ///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
   2613 ///    from the corresponding elements in this operand.
   2614 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
   2615 ///    values from the operand.
   2616 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2617 _mm_cvtpi16_ps(__m64 __a)
   2618 {
   2619   __m64 __b, __c;
   2620   __m128 __r;
   2621 
   2622   __b = _mm_setzero_si64();
   2623   __b = _mm_cmpgt_pi16(__b, __a);
   2624   __c = _mm_unpackhi_pi16(__a, __b);
   2625   __r = _mm_setzero_ps();
   2626   __r = _mm_cvtpi32_ps(__r, __c);
   2627   __r = _mm_movelh_ps(__r, __r);
   2628   __c = _mm_unpacklo_pi16(__a, __b);
   2629   __r = _mm_cvtpi32_ps(__r, __c);
   2630 
   2631   return __r;
   2632 }
   2633 
   2634 /// \brief Converts a 64-bit vector of 16-bit unsigned integer values into a
   2635 ///    128-bit vector of [4 x float].
   2636 ///
   2637 /// \headerfile <x86intrin.h>
   2638 ///
   2639 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
   2640 ///
   2641 /// \param __a
   2642 ///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
   2643 ///    destination are copied from the corresponding elements in this operand.
   2644 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
   2645 ///    values from the operand.
   2646 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2647 _mm_cvtpu16_ps(__m64 __a)
   2648 {
   2649   __m64 __b, __c;
   2650   __m128 __r;
   2651 
   2652   __b = _mm_setzero_si64();
   2653   __c = _mm_unpackhi_pi16(__a, __b);
   2654   __r = _mm_setzero_ps();
   2655   __r = _mm_cvtpi32_ps(__r, __c);
   2656   __r = _mm_movelh_ps(__r, __r);
   2657   __c = _mm_unpacklo_pi16(__a, __b);
   2658   __r = _mm_cvtpi32_ps(__r, __c);
   2659 
   2660   return __r;
   2661 }
   2662 
   2663 /// \brief Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
   2664 ///    into a 128-bit vector of [4 x float].
   2665 ///
   2666 /// \headerfile <x86intrin.h>
   2667 ///
   2668 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
   2669 ///
   2670 /// \param __a
   2671 ///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
   2672 ///    from the corresponding lower 4 elements in this operand.
   2673 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
   2674 ///    values from the operand.
   2675 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2676 _mm_cvtpi8_ps(__m64 __a)
   2677 {
   2678   __m64 __b;
   2679 
   2680   __b = _mm_setzero_si64();
   2681   __b = _mm_cmpgt_pi8(__b, __a);
   2682   __b = _mm_unpacklo_pi8(__a, __b);
   2683 
   2684   return _mm_cvtpi16_ps(__b);
   2685 }
   2686 
   2687 /// \brief Converts the lower four unsigned 8-bit integer values from a 64-bit
   2688 ///    vector of [8 x u8] into a 128-bit vector of [4 x float].
   2689 ///
   2690 /// \headerfile <x86intrin.h>
   2691 ///
   2692 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
   2693 ///
   2694 /// \param __a
   2695 ///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
   2696 ///    destination are copied from the corresponding lower 4 elements in this
   2697 ///    operand.
   2698 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
   2699 ///    values from the source operand.
   2700 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2701 _mm_cvtpu8_ps(__m64 __a)
   2702 {
   2703   __m64 __b;
   2704 
   2705   __b = _mm_setzero_si64();
   2706   __b = _mm_unpacklo_pi8(__a, __b);
   2707 
   2708   return _mm_cvtpi16_ps(__b);
   2709 }
   2710 
   2711 /// \brief Converts the two 32-bit signed integer values from each 64-bit vector
   2712 ///    operand of [2 x i32] into a 128-bit vector of [4 x float].
   2713 ///
   2714 /// \headerfile <x86intrin.h>
   2715 ///
   2716 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
   2717 ///
   2718 /// \param __a
   2719 ///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
   2720 ///    copied from the elements in this operand.
   2721 /// \param __b
   2722 ///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
   2723 ///    copied from the elements in this operand.
   2724 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
   2725 ///    copied and converted values from the first operand. The upper 64 bits
   2726 ///    contain the copied and converted values from the second operand.
   2727 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2728 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
   2729 {
   2730   __m128 __c;
   2731 
   2732   __c = _mm_setzero_ps();
   2733   __c = _mm_cvtpi32_ps(__c, __b);
   2734   __c = _mm_movelh_ps(__c, __c);
   2735 
   2736   return _mm_cvtpi32_ps(__c, __a);
   2737 }
   2738 
   2739 /// \brief Converts each single-precision floating-point element of a 128-bit
   2740 ///    floating-point vector of [4 x float] into a 16-bit signed integer, and
   2741 ///    packs the results into a 64-bit integer vector of [4 x i16]. If the
   2742 ///    floating-point element is NaN or infinity, or if the floating-point
   2743 ///    element is greater than 0x7FFFFFFF or less than -0x8000, it is converted
   2744 ///    to 0x8000. Otherwise if the floating-point element is greater
   2745 ///    than 0x7FFF, it is converted to 0x7FFF.
   2746 ///
   2747 /// \headerfile <x86intrin.h>
   2748 ///
   2749 /// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
   2750 ///
   2751 /// \param __a
   2752 ///    A 128-bit floating-point vector of [4 x float].
   2753 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
   2754 ///    values.
   2755 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2756 _mm_cvtps_pi16(__m128 __a)
   2757 {
   2758   __m64 __b, __c;
   2759 
   2760   __b = _mm_cvtps_pi32(__a);
   2761   __a = _mm_movehl_ps(__a, __a);
   2762   __c = _mm_cvtps_pi32(__a);
   2763 
   2764   return _mm_packs_pi32(__b, __c);
   2765 }
   2766 
   2767 /// \brief Converts each single-precision floating-point element of a 128-bit
   2768 ///    floating-point vector of [4 x float] into an 8-bit signed integer, and
   2769 ///    packs the results into the lower 32 bits of a 64-bit integer vector of
   2770 ///    [8 x i8]. The upper 32 bits of the vector are set to 0. If the
   2771 ///    floating-point element is NaN or infinity, or if the floating-point
   2772 ///    element is greater than 0x7FFFFFFF or less than -0x80, it is converted
   2773 ///    to 0x80. Otherwise if the floating-point element is greater
   2774 ///    than 0x7F, it is converted to 0x7F.
   2775 ///
   2776 /// \headerfile <x86intrin.h>
   2777 ///
   2778 /// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
   2779 ///
   2780 /// \param __a
   2781 ///    128-bit floating-point vector of [4 x float].
   2782 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
   2783 ///    converted values and the uppper 32 bits are set to zero.
   2784 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2785 _mm_cvtps_pi8(__m128 __a)
   2786 {
   2787   __m64 __b, __c;
   2788 
   2789   __b = _mm_cvtps_pi16(__a);
   2790   __c = _mm_setzero_si64();
   2791 
   2792   return _mm_packs_pi16(__b, __c);
   2793 }
   2794 
   2795 /// \brief Extracts the sign bits from each single-precision floating-point
   2796 ///    element of a 128-bit floating-point vector of [4 x float] and returns the
   2797 ///    sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
   2798 ///    to zero.
   2799 ///
   2800 /// \headerfile <x86intrin.h>
   2801 ///
   2802 /// This intrinsic corresponds to the \c VMOVMSKPS / MOVMSKPS instruction.
   2803 ///
   2804 /// \param __a
   2805 ///    A 128-bit floating-point vector of [4 x float].
   2806 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
   2807 ///    single-precision floating-point element of the parameter. Bits [31:4] are
   2808 ///    set to zero.
   2809 static __inline__ int __DEFAULT_FN_ATTRS
   2810 _mm_movemask_ps(__m128 __a)
   2811 {
   2812   return __builtin_ia32_movmskps((__v4sf)__a);
   2813 }
   2814 
   2815 
   2816 #define _MM_ALIGN16 __attribute__((aligned(16)))
   2817 
   2818 #define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
   2819 
   2820 #define _MM_EXCEPT_INVALID    (0x0001)
   2821 #define _MM_EXCEPT_DENORM     (0x0002)
   2822 #define _MM_EXCEPT_DIV_ZERO   (0x0004)
   2823 #define _MM_EXCEPT_OVERFLOW   (0x0008)
   2824 #define _MM_EXCEPT_UNDERFLOW  (0x0010)
   2825 #define _MM_EXCEPT_INEXACT    (0x0020)
   2826 #define _MM_EXCEPT_MASK       (0x003f)
   2827 
   2828 #define _MM_MASK_INVALID      (0x0080)
   2829 #define _MM_MASK_DENORM       (0x0100)
   2830 #define _MM_MASK_DIV_ZERO     (0x0200)
   2831 #define _MM_MASK_OVERFLOW     (0x0400)
   2832 #define _MM_MASK_UNDERFLOW    (0x0800)
   2833 #define _MM_MASK_INEXACT      (0x1000)
   2834 #define _MM_MASK_MASK         (0x1f80)
   2835 
   2836 #define _MM_ROUND_NEAREST     (0x0000)
   2837 #define _MM_ROUND_DOWN        (0x2000)
   2838 #define _MM_ROUND_UP          (0x4000)
   2839 #define _MM_ROUND_TOWARD_ZERO (0x6000)
   2840 #define _MM_ROUND_MASK        (0x6000)
   2841 
   2842 #define _MM_FLUSH_ZERO_MASK   (0x8000)
   2843 #define _MM_FLUSH_ZERO_ON     (0x8000)
   2844 #define _MM_FLUSH_ZERO_OFF    (0x0000)
   2845 
   2846 #define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
   2847 #define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
   2848 #define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
   2849 #define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
   2850 
   2851 #define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
   2852 #define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
   2853 #define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
   2854 #define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
   2855 
   2856 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
   2857 do { \
   2858   __m128 tmp3, tmp2, tmp1, tmp0; \
   2859   tmp0 = _mm_unpacklo_ps((row0), (row1)); \
   2860   tmp2 = _mm_unpacklo_ps((row2), (row3)); \
   2861   tmp1 = _mm_unpackhi_ps((row0), (row1)); \
   2862   tmp3 = _mm_unpackhi_ps((row2), (row3)); \
   2863   (row0) = _mm_movelh_ps(tmp0, tmp2); \
   2864   (row1) = _mm_movehl_ps(tmp2, tmp0); \
   2865   (row2) = _mm_movelh_ps(tmp1, tmp3); \
   2866   (row3) = _mm_movehl_ps(tmp3, tmp1); \
   2867 } while (0)
   2868 
   2869 /* Aliases for compatibility. */
   2870 #define _m_pextrw _mm_extract_pi16
   2871 #define _m_pinsrw _mm_insert_pi16
   2872 #define _m_pmaxsw _mm_max_pi16
   2873 #define _m_pmaxub _mm_max_pu8
   2874 #define _m_pminsw _mm_min_pi16
   2875 #define _m_pminub _mm_min_pu8
   2876 #define _m_pmovmskb _mm_movemask_pi8
   2877 #define _m_pmulhuw _mm_mulhi_pu16
   2878 #define _m_pshufw _mm_shuffle_pi16
   2879 #define _m_maskmovq _mm_maskmove_si64
   2880 #define _m_pavgb _mm_avg_pu8
   2881 #define _m_pavgw _mm_avg_pu16
   2882 #define _m_psadbw _mm_sad_pu8
   2883 #define _m_ _mm_
   2884 #define _m_ _mm_
   2885 
   2886 #undef __DEFAULT_FN_ATTRS
   2887 
   2888 /* Ugly hack for backwards-compatibility (compatible with gcc) */
   2889 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
   2890 #include <emmintrin.h>
   2891 #endif
   2892 
   2893 #endif /* __XMMINTRIN_H */
   2894