Home | History | Annotate | Download | only in include
      1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
      2  *
      3  * Permission is hereby granted, free of charge, to any person obtaining a copy
      4  * of this software and associated documentation files (the "Software"), to deal
      5  * in the Software without restriction, including without limitation the rights
      6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      7  * copies of the Software, and to permit persons to whom the Software is
      8  * furnished to do so, subject to the following conditions:
      9  *
     10  * The above copyright notice and this permission notice shall be included in
     11  * all copies or substantial portions of the Software.
     12  *
     13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     19  * THE SOFTWARE.
     20  *
     21  *===-----------------------------------------------------------------------===
     22  */
     23 
     24 #ifndef __XMMINTRIN_H
     25 #define __XMMINTRIN_H
     26 
     27 #include <mmintrin.h>
     28 
     29 typedef int __v4si __attribute__((__vector_size__(16)));
     30 typedef float __v4sf __attribute__((__vector_size__(16)));
     31 typedef float __m128 __attribute__((__vector_size__(16)));
     32 
     33 /* Unsigned types */
     34 typedef unsigned int __v4su __attribute__((__vector_size__(16)));
     35 
     36 /* This header should only be included in a hosted environment as it depends on
     37  * a standard library to provide allocation routines. */
     38 #if __STDC_HOSTED__
     39 #include <mm_malloc.h>
     40 #endif
     41 
     42 /* Define the default attributes for the functions in this file. */
     43 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse")))
     44 
     45 /// \brief Adds the 32-bit float values in the low-order bits of the operands.
     46 ///
     47 /// \headerfile <x86intrin.h>
     48 ///
     49 /// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
     50 ///
     51 /// \param __a
     52 ///    A 128-bit vector of [4 x float] containing one of the source operands.
     53 ///    The lower 32 bits of this operand are used in the calculation.
     54 /// \param __b
     55 ///    A 128-bit vector of [4 x float] containing one of the source operands.
     56 ///    The lower 32 bits of this operand are used in the calculation.
     57 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
     58 ///    of the lower 32 bits of both operands. The upper 96 bits are copied from
     59 ///    the upper 96 bits of the first source operand.
     60 static __inline__ __m128 __DEFAULT_FN_ATTRS
     61 _mm_add_ss(__m128 __a, __m128 __b)
     62 {
     63   __a[0] += __b[0];
     64   return __a;
     65 }
     66 
     67 /// \brief Adds two 128-bit vectors of [4 x float], and returns the results of
     68 ///    the addition.
     69 ///
     70 /// \headerfile <x86intrin.h>
     71 ///
     72 /// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
     73 ///
     74 /// \param __a
     75 ///    A 128-bit vector of [4 x float] containing one of the source operands.
     76 /// \param __b
     77 ///    A 128-bit vector of [4 x float] containing one of the source operands.
     78 /// \returns A 128-bit vector of [4 x float] containing the sums of both
     79 ///    operands.
     80 static __inline__ __m128 __DEFAULT_FN_ATTRS
     81 _mm_add_ps(__m128 __a, __m128 __b)
     82 {
     83   return (__m128)((__v4sf)__a + (__v4sf)__b);
     84 }
     85 
     86 /// \brief Subtracts the 32-bit float value in the low-order bits of the second
     87 ///    operand from the corresponding value in the first operand.
     88 ///
     89 /// \headerfile <x86intrin.h>
     90 ///
     91 /// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
     92 ///
     93 /// \param __a
     94 ///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
     95 ///    of this operand are used in the calculation.
     96 /// \param __b
     97 ///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
     98 ///    bits of this operand are used in the calculation.
     99 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
    100 ///    difference of the lower 32 bits of both operands. The upper 96 bits are
    101 ///    copied from the upper 96 bits of the first source operand.
    102 static __inline__ __m128 __DEFAULT_FN_ATTRS
    103 _mm_sub_ss(__m128 __a, __m128 __b)
    104 {
    105   __a[0] -= __b[0];
    106   return __a;
    107 }
    108 
    109 /// \brief Subtracts each of the values of the second operand from the first
    110 ///    operand, both of which are 128-bit vectors of [4 x float] and returns
    111 ///    the results of the subtraction.
    112 ///
    113 /// \headerfile <x86intrin.h>
    114 ///
    115 /// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
    116 ///
    117 /// \param __a
    118 ///    A 128-bit vector of [4 x float] containing the minuend.
    119 /// \param __b
    120 ///    A 128-bit vector of [4 x float] containing the subtrahend.
    121 /// \returns A 128-bit vector of [4 x float] containing the differences between
    122 ///    both operands.
    123 static __inline__ __m128 __DEFAULT_FN_ATTRS
    124 _mm_sub_ps(__m128 __a, __m128 __b)
    125 {
    126   return (__m128)((__v4sf)__a - (__v4sf)__b);
    127 }
    128 
    129 /// \brief Multiplies two 32-bit float values in the low-order bits of the
    130 ///    operands.
    131 ///
    132 /// \headerfile <x86intrin.h>
    133 ///
    134 /// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
    135 ///
    136 /// \param __a
    137 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    138 ///    The lower 32 bits of this operand are used in the calculation.
    139 /// \param __b
    140 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    141 ///    The lower 32 bits of this operand are used in the calculation.
    142 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
    143 ///    32 bits of both operands. The upper 96 bits are copied from the upper 96
    144 ///    bits of the first source operand.
    145 static __inline__ __m128 __DEFAULT_FN_ATTRS
    146 _mm_mul_ss(__m128 __a, __m128 __b)
    147 {
    148   __a[0] *= __b[0];
    149   return __a;
    150 }
    151 
    152 /// \brief Multiplies two 128-bit vectors of [4 x float] and returns the
    153 ///    results of the multiplication.
    154 ///
    155 /// \headerfile <x86intrin.h>
    156 ///
    157 /// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
    158 ///
    159 /// \param __a
    160 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    161 /// \param __b
    162 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    163 /// \returns A 128-bit vector of [4 x float] containing the products of both
    164 ///    operands.
    165 static __inline__ __m128 __DEFAULT_FN_ATTRS
    166 _mm_mul_ps(__m128 __a, __m128 __b)
    167 {
    168   return (__m128)((__v4sf)__a * (__v4sf)__b);
    169 }
    170 
    171 /// \brief Divides the value in the low-order 32 bits of the first operand by
    172 ///    the corresponding value in the second operand.
    173 ///
    174 /// \headerfile <x86intrin.h>
    175 ///
    176 /// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
    177 ///
    178 /// \param __a
    179 ///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
    180 ///    bits of this operand are used in the calculation.
    181 /// \param __b
    182 ///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
    183 ///    of this operand are used in the calculation.
    184 /// \returns A 128-bit vector of [4 x float] containing the quotients of the
    185 ///    lower 32 bits of both operands. The upper 96 bits are copied from the
    186 ///    upper 96 bits of the first source operand.
    187 static __inline__ __m128 __DEFAULT_FN_ATTRS
    188 _mm_div_ss(__m128 __a, __m128 __b)
    189 {
    190   __a[0] /= __b[0];
    191   return __a;
    192 }
    193 
    194 /// \brief Divides two 128-bit vectors of [4 x float].
    195 ///
    196 /// \headerfile <x86intrin.h>
    197 ///
    198 /// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
    199 ///
    200 /// \param __a
    201 ///    A 128-bit vector of [4 x float] containing the dividend.
    202 /// \param __b
    203 ///    A 128-bit vector of [4 x float] containing the divisor.
    204 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
    205 ///    operands.
    206 static __inline__ __m128 __DEFAULT_FN_ATTRS
    207 _mm_div_ps(__m128 __a, __m128 __b)
    208 {
    209   return (__m128)((__v4sf)__a / (__v4sf)__b);
    210 }
    211 
    212 /// \brief Calculates the square root of the value stored in the low-order bits
    213 ///    of a 128-bit vector of [4 x float].
    214 ///
    215 /// \headerfile <x86intrin.h>
    216 ///
    217 /// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
    218 ///
    219 /// \param __a
    220 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    221 ///    used in the calculation.
    222 /// \returns A 128-bit vector of [4 x float] containing the square root of the
    223 ///    value in the low-order bits of the operand.
    224 static __inline__ __m128 __DEFAULT_FN_ATTRS
    225 _mm_sqrt_ss(__m128 __a)
    226 {
    227   __m128 __c = __builtin_ia32_sqrtss((__v4sf)__a);
    228   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
    229 }
    230 
    231 /// \brief Calculates the square roots of the values stored in a 128-bit vector
    232 ///    of [4 x float].
    233 ///
    234 /// \headerfile <x86intrin.h>
    235 ///
    236 /// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
    237 ///
    238 /// \param __a
    239 ///    A 128-bit vector of [4 x float].
    240 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
    241 ///    values in the operand.
    242 static __inline__ __m128 __DEFAULT_FN_ATTRS
    243 _mm_sqrt_ps(__m128 __a)
    244 {
    245   return __builtin_ia32_sqrtps((__v4sf)__a);
    246 }
    247 
    248 /// \brief Calculates the approximate reciprocal of the value stored in the
    249 ///    low-order bits of a 128-bit vector of [4 x float].
    250 ///
    251 /// \headerfile <x86intrin.h>
    252 ///
    253 /// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
    254 ///
    255 /// \param __a
    256 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    257 ///    used in the calculation.
    258 /// \returns A 128-bit vector of [4 x float] containing the approximate
    259 ///    reciprocal of the value in the low-order bits of the operand.
    260 static __inline__ __m128 __DEFAULT_FN_ATTRS
    261 _mm_rcp_ss(__m128 __a)
    262 {
    263   __m128 __c = __builtin_ia32_rcpss((__v4sf)__a);
    264   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
    265 }
    266 
    267 /// \brief Calculates the approximate reciprocals of the values stored in a
    268 ///    128-bit vector of [4 x float].
    269 ///
    270 /// \headerfile <x86intrin.h>
    271 ///
    272 /// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
    273 ///
    274 /// \param __a
    275 ///    A 128-bit vector of [4 x float].
    276 /// \returns A 128-bit vector of [4 x float] containing the approximate
    277 ///    reciprocals of the values in the operand.
    278 static __inline__ __m128 __DEFAULT_FN_ATTRS
    279 _mm_rcp_ps(__m128 __a)
    280 {
    281   return __builtin_ia32_rcpps((__v4sf)__a);
    282 }
    283 
    284 /// \brief Calculates the approximate reciprocal of the square root of the value
    285 ///    stored in the low-order bits of a 128-bit vector of [4 x float].
    286 ///
    287 /// \headerfile <x86intrin.h>
    288 ///
    289 /// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
    290 ///
    291 /// \param __a
    292 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    293 ///    used in the calculation.
    294 /// \returns A 128-bit vector of [4 x float] containing the approximate
    295 ///    reciprocal of the square root of the value in the low-order bits of the
    296 ///    operand.
    297 static __inline__ __m128 __DEFAULT_FN_ATTRS
    298 _mm_rsqrt_ss(__m128 __a)
    299 {
    300   __m128 __c = __builtin_ia32_rsqrtss((__v4sf)__a);
    301   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
    302 }
    303 
    304 /// \brief Calculates the approximate reciprocals of the square roots of the
    305 ///    values stored in a 128-bit vector of [4 x float].
    306 ///
    307 /// \headerfile <x86intrin.h>
    308 ///
    309 /// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
    310 ///
    311 /// \param __a
    312 ///    A 128-bit vector of [4 x float].
    313 /// \returns A 128-bit vector of [4 x float] containing the approximate
    314 ///    reciprocals of the square roots of the values in the operand.
    315 static __inline__ __m128 __DEFAULT_FN_ATTRS
    316 _mm_rsqrt_ps(__m128 __a)
    317 {
    318   return __builtin_ia32_rsqrtps((__v4sf)__a);
    319 }
    320 
    321 /// \brief Compares two 32-bit float values in the low-order bits of both
    322 ///    operands and returns the lesser value in the low-order bits of the
    323 ///    vector of [4 x float].
    324 ///
    325 /// \headerfile <x86intrin.h>
    326 ///
    327 /// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
    328 ///
    329 /// \param __a
    330 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    331 ///    32 bits of this operand are used in the comparison.
    332 /// \param __b
    333 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    334 ///    32 bits of this operand are used in the comparison.
    335 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
    336 ///    minimum value between both operands. The upper 96 bits are copied from
    337 ///    the upper 96 bits of the first source operand.
    338 static __inline__ __m128 __DEFAULT_FN_ATTRS
    339 _mm_min_ss(__m128 __a, __m128 __b)
    340 {
    341   return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
    342 }
    343 
    344 /// \brief Compares two 128-bit vectors of [4 x float] and returns the lesser
    345 ///    of each pair of values.
    346 ///
    347 /// \headerfile <x86intrin.h>
    348 ///
    349 /// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
    350 ///
    351 /// \param __a
    352 ///    A 128-bit vector of [4 x float] containing one of the operands.
    353 /// \param __b
    354 ///    A 128-bit vector of [4 x float] containing one of the operands.
    355 /// \returns A 128-bit vector of [4 x float] containing the minimum values
    356 ///    between both operands.
    357 static __inline__ __m128 __DEFAULT_FN_ATTRS
    358 _mm_min_ps(__m128 __a, __m128 __b)
    359 {
    360   return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
    361 }
    362 
    363 /// \brief Compares two 32-bit float values in the low-order bits of both
    364 ///    operands and returns the greater value in the low-order bits of a 128-bit
    365 ///    vector of [4 x float].
    366 ///
    367 /// \headerfile <x86intrin.h>
    368 ///
    369 /// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
    370 ///
    371 /// \param __a
    372 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    373 ///    32 bits of this operand are used in the comparison.
    374 /// \param __b
    375 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    376 ///    32 bits of this operand are used in the comparison.
    377 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
    378 ///    maximum value between both operands. The upper 96 bits are copied from
    379 ///    the upper 96 bits of the first source operand.
    380 static __inline__ __m128 __DEFAULT_FN_ATTRS
    381 _mm_max_ss(__m128 __a, __m128 __b)
    382 {
    383   return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
    384 }
    385 
    386 /// \brief Compares two 128-bit vectors of [4 x float] and returns the greater
    387 ///    of each pair of values.
    388 ///
    389 /// \headerfile <x86intrin.h>
    390 ///
    391 /// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
    392 ///
    393 /// \param __a
    394 ///    A 128-bit vector of [4 x float] containing one of the operands.
    395 /// \param __b
    396 ///    A 128-bit vector of [4 x float] containing one of the operands.
    397 /// \returns A 128-bit vector of [4 x float] containing the maximum values
    398 ///    between both operands.
    399 static __inline__ __m128 __DEFAULT_FN_ATTRS
    400 _mm_max_ps(__m128 __a, __m128 __b)
    401 {
    402   return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
    403 }
    404 
    405 /// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float].
    406 ///
    407 /// \headerfile <x86intrin.h>
    408 ///
    409 /// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
    410 ///
    411 /// \param __a
    412 ///    A 128-bit vector containing one of the source operands.
    413 /// \param __b
    414 ///    A 128-bit vector containing one of the source operands.
    415 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
    416 ///    values between both operands.
    417 static __inline__ __m128 __DEFAULT_FN_ATTRS
    418 _mm_and_ps(__m128 __a, __m128 __b)
    419 {
    420   return (__m128)((__v4su)__a & (__v4su)__b);
    421 }
    422 
    423 /// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float], using
    424 ///    the one's complement of the values contained in the first source
    425 ///    operand.
    426 ///
    427 /// \headerfile <x86intrin.h>
    428 ///
    429 /// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
    430 ///
    431 /// \param __a
    432 ///    A 128-bit vector of [4 x float] containing the first source operand. The
    433 ///    one's complement of this value is used in the bitwise AND.
    434 /// \param __b
    435 ///    A 128-bit vector of [4 x float] containing the second source operand.
    436 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
    437 ///    one's complement of the first operand and the values in the second
    438 ///    operand.
    439 static __inline__ __m128 __DEFAULT_FN_ATTRS
    440 _mm_andnot_ps(__m128 __a, __m128 __b)
    441 {
    442   return (__m128)(~(__v4su)__a & (__v4su)__b);
    443 }
    444 
    445 /// \brief Performs a bitwise OR of two 128-bit vectors of [4 x float].
    446 ///
    447 /// \headerfile <x86intrin.h>
    448 ///
    449 /// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
    450 ///
    451 /// \param __a
    452 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    453 /// \param __b
    454 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    455 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
    456 ///    values between both operands.
    457 static __inline__ __m128 __DEFAULT_FN_ATTRS
    458 _mm_or_ps(__m128 __a, __m128 __b)
    459 {
    460   return (__m128)((__v4su)__a | (__v4su)__b);
    461 }
    462 
    463 /// \brief Performs a bitwise exclusive OR of two 128-bit vectors of
    464 ///    [4 x float].
    465 ///
    466 /// \headerfile <x86intrin.h>
    467 ///
    468 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
    469 ///
    470 /// \param __a
    471 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    472 /// \param __b
    473 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    474 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
    475 ///    of the values between both operands.
    476 static __inline__ __m128 __DEFAULT_FN_ATTRS
    477 _mm_xor_ps(__m128 __a, __m128 __b)
    478 {
    479   return (__m128)((__v4su)__a ^ (__v4su)__b);
    480 }
    481 
    482 /// \brief Compares two 32-bit float values in the low-order bits of both
    483 ///    operands for equality and returns the result of the comparison in the
    484 ///    low-order bits of a vector [4 x float].
    485 ///
    486 /// \headerfile <x86intrin.h>
    487 ///
    488 /// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
    489 ///
    490 /// \param __a
    491 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    492 ///    32 bits of this operand are used in the comparison.
    493 /// \param __b
    494 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    495 ///    32 bits of this operand are used in the comparison.
    496 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    497 ///    in the low-order bits.
    498 static __inline__ __m128 __DEFAULT_FN_ATTRS
    499 _mm_cmpeq_ss(__m128 __a, __m128 __b)
    500 {
    501   return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
    502 }
    503 
    504 /// \brief Compares each of the corresponding 32-bit float values of the
    505 ///    128-bit vectors of [4 x float] for equality.
    506 ///
    507 /// \headerfile <x86intrin.h>
    508 ///
    509 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
    510 ///
    511 /// \param __a
    512 ///    A 128-bit vector of [4 x float].
    513 /// \param __b
    514 ///    A 128-bit vector of [4 x float].
    515 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    516 static __inline__ __m128 __DEFAULT_FN_ATTRS
    517 _mm_cmpeq_ps(__m128 __a, __m128 __b)
    518 {
    519   return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
    520 }
    521 
    522 /// \brief Compares two 32-bit float values in the low-order bits of both
    523 ///    operands to determine if the value in the first operand is less than the
    524 ///    corresponding value in the second operand and returns the result of the
    525 ///    comparison in the low-order bits of a vector of [4 x float].
    526 ///
    527 /// \headerfile <x86intrin.h>
    528 ///
    529 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
    530 ///
    531 /// \param __a
    532 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    533 ///    32 bits of this operand are used in the comparison.
    534 /// \param __b
    535 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    536 ///    32 bits of this operand are used in the comparison.
    537 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    538 ///    in the low-order bits.
    539 static __inline__ __m128 __DEFAULT_FN_ATTRS
    540 _mm_cmplt_ss(__m128 __a, __m128 __b)
    541 {
    542   return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
    543 }
    544 
    545 /// \brief Compares each of the corresponding 32-bit float values of the
    546 ///    128-bit vectors of [4 x float] to determine if the values in the first
    547 ///    operand are less than those in the second operand.
    548 ///
    549 /// \headerfile <x86intrin.h>
    550 ///
    551 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
    552 ///
    553 /// \param __a
    554 ///    A 128-bit vector of [4 x float].
    555 /// \param __b
    556 ///    A 128-bit vector of [4 x float].
    557 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    558 static __inline__ __m128 __DEFAULT_FN_ATTRS
    559 _mm_cmplt_ps(__m128 __a, __m128 __b)
    560 {
    561   return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
    562 }
    563 
    564 /// \brief Compares two 32-bit float values in the low-order bits of both
    565 ///    operands to determine if the value in the first operand is less than or
    566 ///    equal to the corresponding value in the second operand and returns the
    567 ///    result of the comparison in the low-order bits of a vector of
    568 ///    [4 x float].
    569 ///
    570 /// \headerfile <x86intrin.h>
    571 ///
    572 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
    573 ///
    574 /// \param __a
    575 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    576 ///    32 bits of this operand are used in the comparison.
    577 /// \param __b
    578 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    579 ///    32 bits of this operand are used in the comparison.
    580 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    581 ///    in the low-order bits.
    582 static __inline__ __m128 __DEFAULT_FN_ATTRS
    583 _mm_cmple_ss(__m128 __a, __m128 __b)
    584 {
    585   return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
    586 }
    587 
    588 /// \brief Compares each of the corresponding 32-bit float values of the
    589 ///    128-bit vectors of [4 x float] to determine if the values in the first
    590 ///    operand are less than or equal to those in the second operand.
    591 ///
    592 /// \headerfile <x86intrin.h>
    593 ///
    594 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
    595 ///
    596 /// \param __a
    597 ///    A 128-bit vector of [4 x float].
    598 /// \param __b
    599 ///    A 128-bit vector of [4 x float].
    600 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    601 static __inline__ __m128 __DEFAULT_FN_ATTRS
    602 _mm_cmple_ps(__m128 __a, __m128 __b)
    603 {
    604   return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
    605 }
    606 
    607 /// \brief Compares two 32-bit float values in the low-order bits of both
    608 ///    operands to determine if the value in the first operand is greater than
    609 ///    the corresponding value in the second operand and returns the result of
    610 ///    the comparison in the low-order bits of a vector of [4 x float].
    611 ///
    612 /// \headerfile <x86intrin.h>
    613 ///
    614 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
    615 ///
    616 /// \param __a
    617 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    618 ///    32 bits of this operand are used in the comparison.
    619 /// \param __b
    620 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    621 ///    32 bits of this operand are used in the comparison.
    622 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    623 ///    in the low-order bits.
    624 static __inline__ __m128 __DEFAULT_FN_ATTRS
    625 _mm_cmpgt_ss(__m128 __a, __m128 __b)
    626 {
    627   return (__m128)__builtin_shufflevector((__v4sf)__a,
    628                                          (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
    629                                          4, 1, 2, 3);
    630 }
    631 
    632 /// \brief Compares each of the corresponding 32-bit float values of the
    633 ///    128-bit vectors of [4 x float] to determine if the values in the first
    634 ///    operand are greater than those in the second operand.
    635 ///
    636 /// \headerfile <x86intrin.h>
    637 ///
    638 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
    639 ///
    640 /// \param __a
    641 ///    A 128-bit vector of [4 x float].
    642 /// \param __b
    643 ///    A 128-bit vector of [4 x float].
    644 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    645 static __inline__ __m128 __DEFAULT_FN_ATTRS
    646 _mm_cmpgt_ps(__m128 __a, __m128 __b)
    647 {
    648   return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
    649 }
    650 
    651 /// \brief Compares two 32-bit float values in the low-order bits of both
    652 ///    operands to determine if the value in the first operand is greater than
    653 ///    or equal to the corresponding value in the second operand and returns
    654 ///    the result of the comparison in the low-order bits of a vector of
    655 ///    [4 x float].
    656 ///
    657 /// \headerfile <x86intrin.h>
    658 ///
    659 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
    660 ///
    661 /// \param __a
    662 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    663 ///    32 bits of this operand are used in the comparison.
    664 /// \param __b
    665 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    666 ///    32 bits of this operand are used in the comparison.
    667 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    668 ///    in the low-order bits.
    669 static __inline__ __m128 __DEFAULT_FN_ATTRS
    670 _mm_cmpge_ss(__m128 __a, __m128 __b)
    671 {
    672   return (__m128)__builtin_shufflevector((__v4sf)__a,
    673                                          (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
    674                                          4, 1, 2, 3);
    675 }
    676 
    677 /// \brief Compares each of the corresponding 32-bit float values of the
    678 ///    128-bit vectors of [4 x float] to determine if the values in the first
    679 ///    operand are greater than or equal to those in the second operand.
    680 ///
    681 /// \headerfile <x86intrin.h>
    682 ///
    683 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
    684 ///
    685 /// \param __a
    686 ///    A 128-bit vector of [4 x float].
    687 /// \param __b
    688 ///    A 128-bit vector of [4 x float].
    689 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    690 static __inline__ __m128 __DEFAULT_FN_ATTRS
    691 _mm_cmpge_ps(__m128 __a, __m128 __b)
    692 {
    693   return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
    694 }
    695 
    696 /// \brief Compares two 32-bit float values in the low-order bits of both
    697 ///    operands for inequality and returns the result of the comparison in the
    698 ///    low-order bits of a vector of [4 x float].
    699 ///
    700 /// \headerfile <x86intrin.h>
    701 ///
    702 /// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
    703 ///   instructions.
    704 ///
    705 /// \param __a
    706 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    707 ///    32 bits of this operand are used in the comparison.
    708 /// \param __b
    709 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    710 ///    32 bits of this operand are used in the comparison.
    711 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    712 ///    in the low-order bits.
    713 static __inline__ __m128 __DEFAULT_FN_ATTRS
    714 _mm_cmpneq_ss(__m128 __a, __m128 __b)
    715 {
    716   return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
    717 }
    718 
    719 /// \brief Compares each of the corresponding 32-bit float values of the
    720 ///    128-bit vectors of [4 x float] for inequality.
    721 ///
    722 /// \headerfile <x86intrin.h>
    723 ///
    724 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
    725 ///   instructions.
    726 ///
    727 /// \param __a
    728 ///    A 128-bit vector of [4 x float].
    729 /// \param __b
    730 ///    A 128-bit vector of [4 x float].
    731 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    732 static __inline__ __m128 __DEFAULT_FN_ATTRS
    733 _mm_cmpneq_ps(__m128 __a, __m128 __b)
    734 {
    735   return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
    736 }
    737 
    738 /// \brief Compares two 32-bit float values in the low-order bits of both
    739 ///    operands to determine if the value in the first operand is not less than
    740 ///    the corresponding value in the second operand and returns the result of
    741 ///    the comparison in the low-order bits of a vector of [4 x float].
    742 ///
    743 /// \headerfile <x86intrin.h>
    744 ///
    745 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
    746 ///   instructions.
    747 ///
    748 /// \param __a
    749 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    750 ///    32 bits of this operand are used in the comparison.
    751 /// \param __b
    752 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    753 ///    32 bits of this operand are used in the comparison.
    754 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    755 ///    in the low-order bits.
    756 static __inline__ __m128 __DEFAULT_FN_ATTRS
    757 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
    758 {
    759   return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
    760 }
    761 
    762 /// \brief Compares each of the corresponding 32-bit float values of the
    763 ///    128-bit vectors of [4 x float] to determine if the values in the first
    764 ///    operand are not less than those in the second operand.
    765 ///
    766 /// \headerfile <x86intrin.h>
    767 ///
    768 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
    769 ///   instructions.
    770 ///
    771 /// \param __a
    772 ///    A 128-bit vector of [4 x float].
    773 /// \param __b
    774 ///    A 128-bit vector of [4 x float].
    775 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    776 static __inline__ __m128 __DEFAULT_FN_ATTRS
    777 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
    778 {
    779   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
    780 }
    781 
    782 /// \brief Compares two 32-bit float values in the low-order bits of both
    783 ///    operands to determine if the value in the first operand is not less than
    784 ///    or equal to the corresponding value in the second operand and returns
    785 ///    the result of the comparison in the low-order bits of a vector of
    786 ///    [4 x float].
    787 ///
    788 /// \headerfile <x86intrin.h>
    789 ///
    790 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
    791 ///   instructions.
    792 ///
    793 /// \param __a
    794 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    795 ///    32 bits of this operand are used in the comparison.
    796 /// \param __b
    797 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    798 ///    32 bits of this operand are used in the comparison.
    799 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    800 ///    in the low-order bits.
    801 static __inline__ __m128 __DEFAULT_FN_ATTRS
    802 _mm_cmpnle_ss(__m128 __a, __m128 __b)
    803 {
    804   return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
    805 }
    806 
    807 /// \brief Compares each of the corresponding 32-bit float values of the
    808 ///    128-bit vectors of [4 x float] to determine if the values in the first
    809 ///    operand are not less than or equal to those in the second operand.
    810 ///
    811 /// \headerfile <x86intrin.h>
    812 ///
    813 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
    814 ///   instructions.
    815 ///
    816 /// \param __a
    817 ///    A 128-bit vector of [4 x float].
    818 /// \param __b
    819 ///    A 128-bit vector of [4 x float].
    820 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    821 static __inline__ __m128 __DEFAULT_FN_ATTRS
    822 _mm_cmpnle_ps(__m128 __a, __m128 __b)
    823 {
    824   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
    825 }
    826 
    827 /// \brief Compares two 32-bit float values in the low-order bits of both
    828 ///    operands to determine if the value in the first operand is not greater
    829 ///    than the corresponding value in the second operand and returns the
    830 ///    result of the comparison in the low-order bits of a vector of
    831 ///    [4 x float].
    832 ///
    833 /// \headerfile <x86intrin.h>
    834 ///
    835 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
    836 ///   instructions.
    837 ///
    838 /// \param __a
    839 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    840 ///    32 bits of this operand are used in the comparison.
    841 /// \param __b
    842 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    843 ///    32 bits of this operand are used in the comparison.
    844 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    845 ///    in the low-order bits.
    846 static __inline__ __m128 __DEFAULT_FN_ATTRS
    847 _mm_cmpngt_ss(__m128 __a, __m128 __b)
    848 {
    849   return (__m128)__builtin_shufflevector((__v4sf)__a,
    850                                          (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
    851                                          4, 1, 2, 3);
    852 }
    853 
    854 /// \brief Compares each of the corresponding 32-bit float values of the
    855 ///    128-bit vectors of [4 x float] to determine if the values in the first
    856 ///    operand are not greater than those in the second operand.
    857 ///
    858 /// \headerfile <x86intrin.h>
    859 ///
    860 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
    861 ///   instructions.
    862 ///
    863 /// \param __a
    864 ///    A 128-bit vector of [4 x float].
    865 /// \param __b
    866 ///    A 128-bit vector of [4 x float].
    867 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    868 static __inline__ __m128 __DEFAULT_FN_ATTRS
    869 _mm_cmpngt_ps(__m128 __a, __m128 __b)
    870 {
    871   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
    872 }
    873 
    874 /// \brief Compares two 32-bit float values in the low-order bits of both
    875 ///    operands to determine if the value in the first operand is not greater
    876 ///    than or equal to the corresponding value in the second operand and
    877 ///    returns the result of the comparison in the low-order bits of a vector
    878 ///    of [4 x float].
    879 ///
    880 /// \headerfile <x86intrin.h>
    881 ///
    882 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
    883 ///   instructions.
    884 ///
    885 /// \param __a
    886 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    887 ///    32 bits of this operand are used in the comparison.
    888 /// \param __b
    889 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    890 ///    32 bits of this operand are used in the comparison.
    891 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    892 ///    in the low-order bits.
    893 static __inline__ __m128 __DEFAULT_FN_ATTRS
    894 _mm_cmpnge_ss(__m128 __a, __m128 __b)
    895 {
    896   return (__m128)__builtin_shufflevector((__v4sf)__a,
    897                                          (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
    898                                          4, 1, 2, 3);
    899 }
    900 
    901 /// \brief Compares each of the corresponding 32-bit float values of the
    902 ///    128-bit vectors of [4 x float] to determine if the values in the first
    903 ///    operand are not greater than or equal to those in the second operand.
    904 ///
    905 /// \headerfile <x86intrin.h>
    906 ///
    907 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
    908 ///   instructions.
    909 ///
    910 /// \param __a
    911 ///    A 128-bit vector of [4 x float].
    912 /// \param __b
    913 ///    A 128-bit vector of [4 x float].
    914 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    915 static __inline__ __m128 __DEFAULT_FN_ATTRS
    916 _mm_cmpnge_ps(__m128 __a, __m128 __b)
    917 {
    918   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
    919 }
    920 
    921 /// \brief Compares two 32-bit float values in the low-order bits of both
    922 ///    operands to determine if the value in the first operand is ordered with
    923 ///    respect to the corresponding value in the second operand and returns the
    924 ///    result of the comparison in the low-order bits of a vector of
    925 ///    [4 x float].
    926 ///
    927 /// \headerfile <x86intrin.h>
    928 ///
    929 /// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
    930 ///   instructions.
    931 ///
    932 /// \param __a
    933 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    934 ///    32 bits of this operand are used in the comparison.
    935 /// \param __b
    936 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    937 ///    32 bits of this operand are used in the comparison.
    938 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    939 ///    in the low-order bits.
    940 static __inline__ __m128 __DEFAULT_FN_ATTRS
    941 _mm_cmpord_ss(__m128 __a, __m128 __b)
    942 {
    943   return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
    944 }
    945 
    946 /// \brief Compares each of the corresponding 32-bit float values of the
    947 ///    128-bit vectors of [4 x float] to determine if the values in the first
    948 ///    operand are ordered with respect to those in the second operand.
    949 ///
    950 /// \headerfile <x86intrin.h>
    951 ///
    952 /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
    953 ///   instructions.
    954 ///
    955 /// \param __a
    956 ///    A 128-bit vector of [4 x float].
    957 /// \param __b
    958 ///    A 128-bit vector of [4 x float].
    959 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    960 static __inline__ __m128 __DEFAULT_FN_ATTRS
    961 _mm_cmpord_ps(__m128 __a, __m128 __b)
    962 {
    963   return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
    964 }
    965 
    966 /// \brief Compares two 32-bit float values in the low-order bits of both
    967 ///    operands to determine if the value in the first operand is unordered
    968 ///    with respect to the corresponding value in the second operand and
    969 ///    returns the result of the comparison in the low-order bits of a vector
    970 ///    of [4 x float].
    971 ///
    972 /// \headerfile <x86intrin.h>
    973 ///
    974 /// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
    975 ///   instructions.
    976 ///
    977 /// \param __a
    978 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    979 ///    32 bits of this operand are used in the comparison.
    980 /// \param __b
    981 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    982 ///    32 bits of this operand are used in the comparison.
    983 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    984 ///    in the low-order bits.
    985 static __inline__ __m128 __DEFAULT_FN_ATTRS
    986 _mm_cmpunord_ss(__m128 __a, __m128 __b)
    987 {
    988   return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
    989 }
    990 
    991 /// \brief Compares each of the corresponding 32-bit float values of the
    992 ///    128-bit vectors of [4 x float] to determine if the values in the first
    993 ///    operand are unordered with respect to those in the second operand.
    994 ///
    995 /// \headerfile <x86intrin.h>
    996 ///
    997 /// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
    998 ///   instructions.
    999 ///
   1000 /// \param __a
   1001 ///    A 128-bit vector of [4 x float].
   1002 /// \param __b
   1003 ///    A 128-bit vector of [4 x float].
   1004 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
   1005 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1006 _mm_cmpunord_ps(__m128 __a, __m128 __b)
   1007 {
   1008   return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
   1009 }
   1010 
   1011 /// \brief Compares two 32-bit float values in the low-order bits of both
   1012 ///    operands for equality and returns the result of the comparison.
   1013 ///
   1014 /// \headerfile <x86intrin.h>
   1015 ///
   1016 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
   1017 ///   instructions.
   1018 ///
   1019 /// \param __a
   1020 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1021 ///    used in the comparison.
   1022 /// \param __b
   1023 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1024 ///    used in the comparison.
   1025 /// \returns An integer containing the comparison results.
   1026 static __inline__ int __DEFAULT_FN_ATTRS
   1027 _mm_comieq_ss(__m128 __a, __m128 __b)
   1028 {
   1029   return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
   1030 }
   1031 
   1032 /// \brief Compares two 32-bit float values in the low-order bits of both
   1033 ///    operands to determine if the first operand is less than the second
   1034 ///    operand and returns the result of the comparison.
   1035 ///
   1036 /// \headerfile <x86intrin.h>
   1037 ///
   1038 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
   1039 ///   instructions.
   1040 ///
   1041 /// \param __a
   1042 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1043 ///    used in the comparison.
   1044 /// \param __b
   1045 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1046 ///    used in the comparison.
   1047 /// \returns An integer containing the comparison results.
   1048 static __inline__ int __DEFAULT_FN_ATTRS
   1049 _mm_comilt_ss(__m128 __a, __m128 __b)
   1050 {
   1051   return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
   1052 }
   1053 
   1054 /// \brief Compares two 32-bit float values in the low-order bits of both
   1055 ///    operands to determine if the first operand is less than or equal to the
   1056 ///    second operand and returns the result of the comparison.
   1057 ///
   1058 /// \headerfile <x86intrin.h>
   1059 ///
   1060 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
   1061 ///
   1062 /// \param __a
   1063 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1064 ///    used in the comparison.
   1065 /// \param __b
   1066 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1067 ///    used in the comparison.
   1068 /// \returns An integer containing the comparison results.
   1069 static __inline__ int __DEFAULT_FN_ATTRS
   1070 _mm_comile_ss(__m128 __a, __m128 __b)
   1071 {
   1072   return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
   1073 }
   1074 
   1075 /// \brief Compares two 32-bit float values in the low-order bits of both
   1076 ///    operands to determine if the first operand is greater than the second
   1077 ///    operand and returns the result of the comparison.
   1078 ///
   1079 /// \headerfile <x86intrin.h>
   1080 ///
   1081 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
   1082 ///
   1083 /// \param __a
   1084 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1085 ///    used in the comparison.
   1086 /// \param __b
   1087 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1088 ///    used in the comparison.
   1089 /// \returns An integer containing the comparison results.
   1090 static __inline__ int __DEFAULT_FN_ATTRS
   1091 _mm_comigt_ss(__m128 __a, __m128 __b)
   1092 {
   1093   return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
   1094 }
   1095 
   1096 /// \brief Compares two 32-bit float values in the low-order bits of both
   1097 ///    operands to determine if the first operand is greater than or equal to
   1098 ///    the second operand and returns the result of the comparison.
   1099 ///
   1100 /// \headerfile <x86intrin.h>
   1101 ///
   1102 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
   1103 ///
   1104 /// \param __a
   1105 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1106 ///    used in the comparison.
   1107 /// \param __b
   1108 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1109 ///    used in the comparison.
   1110 /// \returns An integer containing the comparison results.
   1111 static __inline__ int __DEFAULT_FN_ATTRS
   1112 _mm_comige_ss(__m128 __a, __m128 __b)
   1113 {
   1114   return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
   1115 }
   1116 
   1117 /// \brief Compares two 32-bit float values in the low-order bits of both
   1118 ///    operands to determine if the first operand is not equal to the second
   1119 ///    operand and returns the result of the comparison.
   1120 ///
   1121 /// \headerfile <x86intrin.h>
   1122 ///
   1123 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
   1124 ///
   1125 /// \param __a
   1126 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1127 ///    used in the comparison.
   1128 /// \param __b
   1129 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1130 ///    used in the comparison.
   1131 /// \returns An integer containing the comparison results.
   1132 static __inline__ int __DEFAULT_FN_ATTRS
   1133 _mm_comineq_ss(__m128 __a, __m128 __b)
   1134 {
   1135   return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
   1136 }
   1137 
   1138 /// \brief Performs an unordered comparison of two 32-bit float values using
   1139 ///    the low-order bits of both operands to determine equality and returns
   1140 ///    the result of the comparison.
   1141 ///
   1142 /// \headerfile <x86intrin.h>
   1143 ///
   1144 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
   1145 ///
   1146 /// \param __a
   1147 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1148 ///    used in the comparison.
   1149 /// \param __b
   1150 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1151 ///    used in the comparison.
   1152 /// \returns An integer containing the comparison results.
   1153 static __inline__ int __DEFAULT_FN_ATTRS
   1154 _mm_ucomieq_ss(__m128 __a, __m128 __b)
   1155 {
   1156   return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
   1157 }
   1158 
   1159 /// \brief Performs an unordered comparison of two 32-bit float values using
   1160 ///    the low-order bits of both operands to determine if the first operand is
   1161 ///    less than the second operand and returns the result of the comparison.
   1162 ///
   1163 /// \headerfile <x86intrin.h>
   1164 ///
   1165 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
   1166 ///
   1167 /// \param __a
   1168 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1169 ///    used in the comparison.
   1170 /// \param __b
   1171 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1172 ///    used in the comparison.
   1173 /// \returns An integer containing the comparison results.
   1174 static __inline__ int __DEFAULT_FN_ATTRS
   1175 _mm_ucomilt_ss(__m128 __a, __m128 __b)
   1176 {
   1177   return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
   1178 }
   1179 
   1180 /// \brief Performs an unordered comparison of two 32-bit float values using
   1181 ///    the low-order bits of both operands to determine if the first operand is
   1182 ///    less than or equal to the second operand and returns the result of the
   1183 ///    comparison.
   1184 ///
   1185 /// \headerfile <x86intrin.h>
   1186 ///
   1187 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
   1188 ///
   1189 /// \param __a
   1190 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1191 ///    used in the comparison.
   1192 /// \param __b
   1193 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1194 ///    used in the comparison.
   1195 /// \returns An integer containing the comparison results.
   1196 static __inline__ int __DEFAULT_FN_ATTRS
   1197 _mm_ucomile_ss(__m128 __a, __m128 __b)
   1198 {
   1199   return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
   1200 }
   1201 
   1202 /// \brief Performs an unordered comparison of two 32-bit float values using
   1203 ///    the low-order bits of both operands to determine if the first operand is
   1204 ///    greater than the second operand and returns the result of the
   1205 ///    comparison.
   1206 ///
   1207 /// \headerfile <x86intrin.h>
   1208 ///
   1209 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
   1210 ///
   1211 /// \param __a
   1212 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1213 ///    used in the comparison.
   1214 /// \param __b
   1215 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1216 ///    used in the comparison.
   1217 /// \returns An integer containing the comparison results.
   1218 static __inline__ int __DEFAULT_FN_ATTRS
   1219 _mm_ucomigt_ss(__m128 __a, __m128 __b)
   1220 {
   1221   return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
   1222 }
   1223 
   1224 /// \brief Performs an unordered comparison of two 32-bit float values using
   1225 ///    the low-order bits of both operands to determine if the first operand is
   1226 ///    greater than or equal to the second operand and returns the result of
   1227 ///    the comparison.
   1228 ///
   1229 /// \headerfile <x86intrin.h>
   1230 ///
   1231 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
   1232 ///
   1233 /// \param __a
   1234 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1235 ///    used in the comparison.
   1236 /// \param __b
   1237 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1238 ///    used in the comparison.
   1239 /// \returns An integer containing the comparison results.
   1240 static __inline__ int __DEFAULT_FN_ATTRS
   1241 _mm_ucomige_ss(__m128 __a, __m128 __b)
   1242 {
   1243   return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
   1244 }
   1245 
   1246 /// \brief Performs an unordered comparison of two 32-bit float values using
   1247 ///    the low-order bits of both operands to determine inequality and returns
   1248 ///    the result of the comparison.
   1249 ///
   1250 /// \headerfile <x86intrin.h>
   1251 ///
   1252 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
   1253 ///
   1254 /// \param __a
   1255 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1256 ///    used in the comparison.
   1257 /// \param __b
   1258 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1259 ///    used in the comparison.
   1260 /// \returns An integer containing the comparison results.
   1261 static __inline__ int __DEFAULT_FN_ATTRS
   1262 _mm_ucomineq_ss(__m128 __a, __m128 __b)
   1263 {
   1264   return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
   1265 }
   1266 
   1267 /// \brief Converts a float value contained in the lower 32 bits of a vector of
   1268 ///    [4 x float] into a 32-bit integer.
   1269 ///
   1270 /// \headerfile <x86intrin.h>
   1271 ///
   1272 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
   1273 ///   instructions.
   1274 ///
   1275 /// \param __a
   1276 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1277 ///    used in the conversion.
   1278 /// \returns A 32-bit integer containing the converted value.
   1279 static __inline__ int __DEFAULT_FN_ATTRS
   1280 _mm_cvtss_si32(__m128 __a)
   1281 {
   1282   return __builtin_ia32_cvtss2si((__v4sf)__a);
   1283 }
   1284 
   1285 /// \brief Converts a float value contained in the lower 32 bits of a vector of
   1286 ///    [4 x float] into a 32-bit integer.
   1287 ///
   1288 /// \headerfile <x86intrin.h>
   1289 ///
   1290 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
   1291 ///   instructions.
   1292 ///
   1293 /// \param __a
   1294 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1295 ///    used in the conversion.
   1296 /// \returns A 32-bit integer containing the converted value.
   1297 static __inline__ int __DEFAULT_FN_ATTRS
   1298 _mm_cvt_ss2si(__m128 __a)
   1299 {
   1300   return _mm_cvtss_si32(__a);
   1301 }
   1302 
   1303 #ifdef __x86_64__
   1304 
   1305 /// \brief Converts a float value contained in the lower 32 bits of a vector of
   1306 ///    [4 x float] into a 64-bit integer.
   1307 ///
   1308 /// \headerfile <x86intrin.h>
   1309 ///
   1310 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
   1311 ///   instructions.
   1312 ///
   1313 /// \param __a
   1314 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1315 ///    used in the conversion.
   1316 /// \returns A 64-bit integer containing the converted value.
   1317 static __inline__ long long __DEFAULT_FN_ATTRS
   1318 _mm_cvtss_si64(__m128 __a)
   1319 {
   1320   return __builtin_ia32_cvtss2si64((__v4sf)__a);
   1321 }
   1322 
   1323 #endif
   1324 
   1325 /// \brief Converts two low-order float values in a 128-bit vector of
   1326 ///    [4 x float] into a 64-bit vector of [2 x i32].
   1327 ///
   1328 /// \headerfile <x86intrin.h>
   1329 ///
   1330 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
   1331 ///
   1332 /// \param __a
   1333 ///    A 128-bit vector of [4 x float].
   1334 /// \returns A 64-bit integer vector containing the converted values.
   1335 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1336 _mm_cvtps_pi32(__m128 __a)
   1337 {
   1338   return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
   1339 }
   1340 
   1341 /// \brief Converts two low-order float values in a 128-bit vector of
   1342 ///    [4 x float] into a 64-bit vector of [2 x i32].
   1343 ///
   1344 /// \headerfile <x86intrin.h>
   1345 ///
   1346 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
   1347 ///
   1348 /// \param __a
   1349 ///    A 128-bit vector of [4 x float].
   1350 /// \returns A 64-bit integer vector containing the converted values.
   1351 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1352 _mm_cvt_ps2pi(__m128 __a)
   1353 {
   1354   return _mm_cvtps_pi32(__a);
   1355 }
   1356 
   1357 /// \brief Converts a float value contained in the lower 32 bits of a vector of
   1358 ///    [4 x float] into a 32-bit integer, truncating the result when it is
   1359 ///    inexact.
   1360 ///
   1361 /// \headerfile <x86intrin.h>
   1362 ///
   1363 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
   1364 ///   instructions.
   1365 ///
   1366 /// \param __a
   1367 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1368 ///    used in the conversion.
   1369 /// \returns A 32-bit integer containing the converted value.
   1370 static __inline__ int __DEFAULT_FN_ATTRS
   1371 _mm_cvttss_si32(__m128 __a)
   1372 {
   1373   return __builtin_ia32_cvttss2si((__v4sf)__a);
   1374 }
   1375 
   1376 /// \brief Converts a float value contained in the lower 32 bits of a vector of
   1377 ///    [4 x float] into a 32-bit integer, truncating the result when it is
   1378 ///    inexact.
   1379 ///
   1380 /// \headerfile <x86intrin.h>
   1381 ///
   1382 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
   1383 ///   instructions.
   1384 ///
   1385 /// \param __a
   1386 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1387 ///    used in the conversion.
   1388 /// \returns A 32-bit integer containing the converted value.
   1389 static __inline__ int __DEFAULT_FN_ATTRS
   1390 _mm_cvtt_ss2si(__m128 __a)
   1391 {
   1392   return _mm_cvttss_si32(__a);
   1393 }
   1394 
   1395 #ifdef __x86_64__
   1396 /// \brief Converts a float value contained in the lower 32 bits of a vector of
   1397 ///    [4 x float] into a 64-bit integer, truncating the result when it is
   1398 ///    inexact.
   1399 ///
   1400 /// \headerfile <x86intrin.h>
   1401 ///
   1402 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
   1403 ///   instructions.
   1404 ///
   1405 /// \param __a
   1406 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1407 ///    used in the conversion.
   1408 /// \returns A 64-bit integer containing the converted value.
   1409 static __inline__ long long __DEFAULT_FN_ATTRS
   1410 _mm_cvttss_si64(__m128 __a)
   1411 {
   1412   return __builtin_ia32_cvttss2si64((__v4sf)__a);
   1413 }
   1414 #endif
   1415 
   1416 /// \brief Converts two low-order float values in a 128-bit vector of
   1417 ///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
   1418 ///    when it is inexact.
   1419 ///
   1420 /// \headerfile <x86intrin.h>
   1421 ///
   1422 /// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
   1423 ///   instructions.
   1424 ///
   1425 /// \param __a
   1426 ///    A 128-bit vector of [4 x float].
   1427 /// \returns A 64-bit integer vector containing the converted values.
   1428 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1429 _mm_cvttps_pi32(__m128 __a)
   1430 {
   1431   return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
   1432 }
   1433 
   1434 /// \brief Converts two low-order float values in a 128-bit vector of [4 x
   1435 ///    float] into a 64-bit vector of [2 x i32], truncating the result when it
   1436 ///    is inexact.
   1437 ///
   1438 /// \headerfile <x86intrin.h>
   1439 ///
   1440 /// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
   1441 ///
   1442 /// \param __a
   1443 ///    A 128-bit vector of [4 x float].
   1444 /// \returns A 64-bit integer vector containing the converted values.
   1445 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1446 _mm_cvtt_ps2pi(__m128 __a)
   1447 {
   1448   return _mm_cvttps_pi32(__a);
   1449 }
   1450 
   1451 /// \brief Converts a 32-bit signed integer value into a floating point value
   1452 ///    and writes it to the lower 32 bits of the destination. The remaining
   1453 ///    higher order elements of the destination vector are copied from the
   1454 ///    corresponding elements in the first operand.
   1455 ///
   1456 /// \headerfile <x86intrin.h>
   1457 ///
   1458 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
   1459 ///
   1460 /// \param __a
   1461 ///    A 128-bit vector of [4 x float].
   1462 /// \param __b
   1463 ///    A 32-bit signed integer operand containing the value to be converted.
   1464 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
   1465 ///    converted value of the second operand. The upper 96 bits are copied from
   1466 ///    the upper 96 bits of the first operand.
   1467 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1468 _mm_cvtsi32_ss(__m128 __a, int __b)
   1469 {
   1470   __a[0] = __b;
   1471   return __a;
   1472 }
   1473 
   1474 /// \brief Converts a 32-bit signed integer value into a floating point value
   1475 ///    and writes it to the lower 32 bits of the destination. The remaining
   1476 ///    higher order elements of the destination are copied from the
   1477 ///    corresponding elements in the first operand.
   1478 ///
   1479 /// \headerfile <x86intrin.h>
   1480 ///
   1481 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
   1482 ///
   1483 /// \param __a
   1484 ///    A 128-bit vector of [4 x float].
   1485 /// \param __b
   1486 ///    A 32-bit signed integer operand containing the value to be converted.
   1487 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
   1488 ///    converted value of the second operand. The upper 96 bits are copied from
   1489 ///    the upper 96 bits of the first operand.
   1490 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1491 _mm_cvt_si2ss(__m128 __a, int __b)
   1492 {
   1493   return _mm_cvtsi32_ss(__a, __b);
   1494 }
   1495 
   1496 #ifdef __x86_64__
   1497 
   1498 /// \brief Converts a 64-bit signed integer value into a floating point value
   1499 ///    and writes it to the lower 32 bits of the destination. The remaining
   1500 ///    higher order elements of the destination are copied from the
   1501 ///    corresponding elements in the first operand.
   1502 ///
   1503 /// \headerfile <x86intrin.h>
   1504 ///
   1505 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
   1506 ///
   1507 /// \param __a
   1508 ///    A 128-bit vector of [4 x float].
   1509 /// \param __b
   1510 ///    A 64-bit signed integer operand containing the value to be converted.
   1511 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
   1512 ///    converted value of the second operand. The upper 96 bits are copied from
   1513 ///    the upper 96 bits of the first operand.
   1514 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1515 _mm_cvtsi64_ss(__m128 __a, long long __b)
   1516 {
   1517   __a[0] = __b;
   1518   return __a;
   1519 }
   1520 
   1521 #endif
   1522 
   1523 /// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
   1524 ///    floating point values and writes them to the lower 64-bits of the
   1525 ///    destination. The remaining higher order elements of the destination are
   1526 ///    copied from the corresponding elements in the first operand.
   1527 ///
   1528 /// \headerfile <x86intrin.h>
   1529 ///
   1530 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
   1531 ///
   1532 /// \param __a
   1533 ///    A 128-bit vector of [4 x float].
   1534 /// \param __b
   1535 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
   1536 ///    and written to the corresponding low-order elements in the destination.
   1537 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
   1538 ///    converted value of the second operand. The upper 64 bits are copied from
   1539 ///    the upper 64 bits of the first operand.
   1540 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1541 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
   1542 {
   1543   return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
   1544 }
   1545 
   1546 /// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
   1547 ///    floating point values and writes them to the lower 64-bits of the
   1548 ///    destination. The remaining higher order elements of the destination are
   1549 ///    copied from the corresponding elements in the first operand.
   1550 ///
   1551 /// \headerfile <x86intrin.h>
   1552 ///
   1553 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
   1554 ///
   1555 /// \param __a
   1556 ///    A 128-bit vector of [4 x float].
   1557 /// \param __b
   1558 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
   1559 ///    and written to the corresponding low-order elements in the destination.
   1560 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
   1561 ///    converted value from the second operand. The upper 64 bits are copied
   1562 ///    from the upper 64 bits of the first operand.
   1563 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1564 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
   1565 {
   1566   return _mm_cvtpi32_ps(__a, __b);
   1567 }
   1568 
   1569 /// \brief Extracts a float value contained in the lower 32 bits of a vector of
   1570 ///    [4 x float].
   1571 ///
   1572 /// \headerfile <x86intrin.h>
   1573 ///
   1574 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
   1575 ///
   1576 /// \param __a
   1577 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1578 ///    used in the extraction.
   1579 /// \returns A 32-bit float containing the extracted value.
   1580 static __inline__ float __DEFAULT_FN_ATTRS
   1581 _mm_cvtss_f32(__m128 __a)
   1582 {
   1583   return __a[0];
   1584 }
   1585 
   1586 /// \brief Loads two packed float values from the address \a __p into the
   1587 ///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
   1588 ///     are copied from the low-order bits of the first operand.
   1589 ///
   1590 /// \headerfile <x86intrin.h>
   1591 ///
   1592 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
   1593 ///
   1594 /// \param __a
   1595 ///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
   1596 ///    of the destination.
   1597 /// \param __p
   1598 ///    A pointer to two packed float values. Bits [63:0] are written to bits
   1599 ///    [127:64] of the destination.
   1600 /// \returns A 128-bit vector of [4 x float] containing the moved values.
   1601 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1602 _mm_loadh_pi(__m128 __a, const __m64 *__p)
   1603 {
   1604   typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
   1605   struct __mm_loadh_pi_struct {
   1606     __mm_loadh_pi_v2f32 __u;
   1607   } __attribute__((__packed__, __may_alias__));
   1608   __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
   1609   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
   1610   return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
   1611 }
   1612 
   1613 /// \brief Loads two packed float values from the address \a __p into the
   1614 ///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
   1615 ///    are copied from the high-order bits of the first operand.
   1616 ///
   1617 /// \headerfile <x86intrin.h>
   1618 ///
   1619 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
   1620 ///
   1621 /// \param __a
   1622 ///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
   1623 ///    [127:64] of the destination.
   1624 /// \param __p
   1625 ///    A pointer to two packed float values. Bits [63:0] are written to bits
   1626 ///    [63:0] of the destination.
   1627 /// \returns A 128-bit vector of [4 x float] containing the moved values.
   1628 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1629 _mm_loadl_pi(__m128 __a, const __m64 *__p)
   1630 {
   1631   typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
   1632   struct __mm_loadl_pi_struct {
   1633     __mm_loadl_pi_v2f32 __u;
   1634   } __attribute__((__packed__, __may_alias__));
   1635   __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
   1636   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
   1637   return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
   1638 }
   1639 
   1640 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
   1641 ///    32 bits of the vector are initialized with the single-precision
   1642 ///    floating-point value loaded from a specified memory location. The upper
   1643 ///    96 bits are set to zero.
   1644 ///
   1645 /// \headerfile <x86intrin.h>
   1646 ///
   1647 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
   1648 ///
   1649 /// \param __p
   1650 ///    A pointer to a 32-bit memory location containing a single-precision
   1651 ///    floating-point value.
   1652 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
   1653 ///    lower 32 bits contain the value loaded from the memory location. The
   1654 ///    upper 96 bits are set to zero.
   1655 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1656 _mm_load_ss(const float *__p)
   1657 {
   1658   struct __mm_load_ss_struct {
   1659     float __u;
   1660   } __attribute__((__packed__, __may_alias__));
   1661   float __u = ((struct __mm_load_ss_struct*)__p)->__u;
   1662   return (__m128){ __u, 0, 0, 0 };
   1663 }
   1664 
   1665 /// \brief Loads a 32-bit float value and duplicates it to all four vector
   1666 ///    elements of a 128-bit vector of [4 x float].
   1667 ///
   1668 /// \headerfile <x86intrin.h>
   1669 ///
   1670 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS + shuffling </c>
   1671 ///    instruction.
   1672 ///
   1673 /// \param __p
   1674 ///    A pointer to a float value to be loaded and duplicated.
   1675 /// \returns A 128-bit vector of [4 x float] containing the loaded and
   1676 ///    duplicated values.
   1677 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1678 _mm_load1_ps(const float *__p)
   1679 {
   1680   struct __mm_load1_ps_struct {
   1681     float __u;
   1682   } __attribute__((__packed__, __may_alias__));
   1683   float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
   1684   return (__m128){ __u, __u, __u, __u };
   1685 }
   1686 
   1687 #define        _mm_load_ps1(p) _mm_load1_ps(p)
   1688 
   1689 /// \brief Loads a 128-bit floating-point vector of [4 x float] from an aligned
   1690 ///    memory location.
   1691 ///
   1692 /// \headerfile <x86intrin.h>
   1693 ///
   1694 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
   1695 ///
   1696 /// \param __p
   1697 ///    A pointer to a 128-bit memory location. The address of the memory
   1698 ///    location has to be 128-bit aligned.
   1699 /// \returns A 128-bit vector of [4 x float] containing the loaded valus.
   1700 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1701 _mm_load_ps(const float *__p)
   1702 {
   1703   return *(__m128*)__p;
   1704 }
   1705 
   1706 /// \brief Loads a 128-bit floating-point vector of [4 x float] from an
   1707 ///    unaligned memory location.
   1708 ///
   1709 /// \headerfile <x86intrin.h>
   1710 ///
   1711 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
   1712 ///
   1713 /// \param __p
   1714 ///    A pointer to a 128-bit memory location. The address of the memory
   1715 ///    location does not have to be aligned.
   1716 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
   1717 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1718 _mm_loadu_ps(const float *__p)
   1719 {
   1720   struct __loadu_ps {
   1721     __m128 __v;
   1722   } __attribute__((__packed__, __may_alias__));
   1723   return ((struct __loadu_ps*)__p)->__v;
   1724 }
   1725 
   1726 /// \brief Loads four packed float values, in reverse order, from an aligned
   1727 ///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
   1728 ///
   1729 /// \headerfile <x86intrin.h>
   1730 ///
   1731 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
   1732 ///    instruction.
   1733 ///
   1734 /// \param __p
   1735 ///    A pointer to a 128-bit memory location. The address of the memory
   1736 ///    location has to be 128-bit aligned.
   1737 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
   1738 ///    in reverse order.
   1739 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1740 _mm_loadr_ps(const float *__p)
   1741 {
   1742   __m128 __a = _mm_load_ps(__p);
   1743   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
   1744 }
   1745 
   1746 /// \brief Create a 128-bit vector of [4 x float] with undefined values.
   1747 ///
   1748 /// \headerfile <x86intrin.h>
   1749 ///
   1750 /// This intrinsic has no corresponding instruction.
   1751 ///
   1752 /// \returns A 128-bit vector of [4 x float] containing undefined values.
   1753 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1754 _mm_undefined_ps(void)
   1755 {
   1756   return (__m128)__builtin_ia32_undef128();
   1757 }
   1758 
   1759 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
   1760 ///    32 bits of the vector are initialized with the specified single-precision
   1761 ///    floating-point value. The upper 96 bits are set to zero.
   1762 ///
   1763 /// \headerfile <x86intrin.h>
   1764 ///
   1765 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
   1766 ///
   1767 /// \param __w
   1768 ///    A single-precision floating-point value used to initialize the lower 32
   1769 ///    bits of the result.
   1770 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
   1771 ///    lower 32 bits contain the value provided in the source operand. The
   1772 ///    upper 96 bits are set to zero.
   1773 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1774 _mm_set_ss(float __w)
   1775 {
   1776   return (__m128){ __w, 0, 0, 0 };
   1777 }
   1778 
   1779 /// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
   1780 ///    of the four single-precision floating-point vector elements set to the
   1781 ///    specified single-precision floating-point value.
   1782 ///
   1783 /// \headerfile <x86intrin.h>
   1784 ///
   1785 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
   1786 ///
   1787 /// \param __w
   1788 ///    A single-precision floating-point value used to initialize each vector
   1789 ///    element of the result.
   1790 /// \returns An initialized 128-bit floating-point vector of [4 x float].
   1791 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1792 _mm_set1_ps(float __w)
   1793 {
   1794   return (__m128){ __w, __w, __w, __w };
   1795 }
   1796 
   1797 /* Microsoft specific. */
   1798 /// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
   1799 ///    of the four single-precision floating-point vector elements set to the
   1800 ///    specified single-precision floating-point value.
   1801 ///
   1802 /// \headerfile <x86intrin.h>
   1803 ///
   1804 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
   1805 ///
   1806 /// \param __w
   1807 ///    A single-precision floating-point value used to initialize each vector
   1808 ///    element of the result.
   1809 /// \returns An initialized 128-bit floating-point vector of [4 x float].
   1810 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1811 _mm_set_ps1(float __w)
   1812 {
   1813     return _mm_set1_ps(__w);
   1814 }
   1815 
   1816 /// \brief Constructs a 128-bit floating-point vector of [4 x float]
   1817 ///    initialized with the specified single-precision floating-point values.
   1818 ///
   1819 /// \headerfile <x86intrin.h>
   1820 ///
   1821 /// This intrinsic is a utility function and does not correspond to a specific
   1822 ///    instruction.
   1823 ///
   1824 /// \param __z
   1825 ///    A single-precision floating-point value used to initialize bits [127:96]
   1826 ///    of the result.
   1827 /// \param __y
   1828 ///    A single-precision floating-point value used to initialize bits [95:64]
   1829 ///    of the result.
   1830 /// \param __x
   1831 ///    A single-precision floating-point value used to initialize bits [63:32]
   1832 ///    of the result.
   1833 /// \param __w
   1834 ///    A single-precision floating-point value used to initialize bits [31:0]
   1835 ///    of the result.
   1836 /// \returns An initialized 128-bit floating-point vector of [4 x float].
   1837 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1838 _mm_set_ps(float __z, float __y, float __x, float __w)
   1839 {
   1840   return (__m128){ __w, __x, __y, __z };
   1841 }
   1842 
   1843 /// \brief Constructs a 128-bit floating-point vector of [4 x float],
   1844 ///    initialized in reverse order with the specified 32-bit single-precision
   1845 ///    float-point values.
   1846 ///
   1847 /// \headerfile <x86intrin.h>
   1848 ///
   1849 /// This intrinsic is a utility function and does not correspond to a specific
   1850 ///    instruction.
   1851 ///
   1852 /// \param __z
   1853 ///    A single-precision floating-point value used to initialize bits [31:0]
   1854 ///    of the result.
   1855 /// \param __y
   1856 ///    A single-precision floating-point value used to initialize bits [63:32]
   1857 ///    of the result.
   1858 /// \param __x
   1859 ///    A single-precision floating-point value used to initialize bits [95:64]
   1860 ///    of the result.
   1861 /// \param __w
   1862 ///    A single-precision floating-point value used to initialize bits [127:96]
   1863 ///    of the result.
   1864 /// \returns An initialized 128-bit floating-point vector of [4 x float].
   1865 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1866 _mm_setr_ps(float __z, float __y, float __x, float __w)
   1867 {
   1868   return (__m128){ __z, __y, __x, __w };
   1869 }
   1870 
   1871 /// \brief Constructs a 128-bit floating-point vector of [4 x float] initialized
   1872 ///    to zero.
   1873 ///
   1874 /// \headerfile <x86intrin.h>
   1875 ///
   1876 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
   1877 ///
   1878 /// \returns An initialized 128-bit floating-point vector of [4 x float] with
   1879 ///    all elements set to zero.
   1880 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1881 _mm_setzero_ps(void)
   1882 {
   1883   return (__m128){ 0, 0, 0, 0 };
   1884 }
   1885 
   1886 /// \brief Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
   1887 ///    memory location.
   1888 ///
   1889 /// \headerfile <x86intrin.h>
   1890 ///
   1891 /// This intrinsic corresponds to the <c> VPEXTRQ / MOVQ </c> instruction.
   1892 ///
   1893 /// \param __p
   1894 ///    A pointer to a 64-bit memory location.
   1895 /// \param __a
   1896 ///    A 128-bit vector of [4 x float] containing the values to be stored.
   1897 static __inline__ void __DEFAULT_FN_ATTRS
   1898 _mm_storeh_pi(__m64 *__p, __m128 __a)
   1899 {
   1900   __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a);
   1901 }
   1902 
   1903 /// \brief Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
   1904 ///     memory location.
   1905 ///
   1906 /// \headerfile <x86intrin.h>
   1907 ///
   1908 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
   1909 ///
   1910 /// \param __p
   1911 ///    A pointer to a memory location that will receive the float values.
   1912 /// \param __a
   1913 ///    A 128-bit vector of [4 x float] containing the values to be stored.
   1914 static __inline__ void __DEFAULT_FN_ATTRS
   1915 _mm_storel_pi(__m64 *__p, __m128 __a)
   1916 {
   1917   __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a);
   1918 }
   1919 
   1920 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
   1921 ///     memory location.
   1922 ///
   1923 /// \headerfile <x86intrin.h>
   1924 ///
   1925 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
   1926 ///
   1927 /// \param __p
   1928 ///    A pointer to a 32-bit memory location.
   1929 /// \param __a
   1930 ///    A 128-bit vector of [4 x float] containing the value to be stored.
   1931 static __inline__ void __DEFAULT_FN_ATTRS
   1932 _mm_store_ss(float *__p, __m128 __a)
   1933 {
   1934   struct __mm_store_ss_struct {
   1935     float __u;
   1936   } __attribute__((__packed__, __may_alias__));
   1937   ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
   1938 }
   1939 
   1940 /// \brief Stores a 128-bit vector of [4 x float] to an unaligned memory
   1941 ///    location.
   1942 ///
   1943 /// \headerfile <x86intrin.h>
   1944 ///
   1945 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
   1946 ///
   1947 /// \param __p
   1948 ///    A pointer to a 128-bit memory location. The address of the memory
   1949 ///    location does not have to be aligned.
   1950 /// \param __a
   1951 ///    A 128-bit vector of [4 x float] containing the values to be stored.
   1952 static __inline__ void __DEFAULT_FN_ATTRS
   1953 _mm_storeu_ps(float *__p, __m128 __a)
   1954 {
   1955   struct __storeu_ps {
   1956     __m128 __v;
   1957   } __attribute__((__packed__, __may_alias__));
   1958   ((struct __storeu_ps*)__p)->__v = __a;
   1959 }
   1960 
   1961 /// \brief Stores a 128-bit vector of [4 x float] into an aligned memory
   1962 ///    location.
   1963 ///
   1964 /// \headerfile <x86intrin.h>
   1965 ///
   1966 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
   1967 ///
   1968 /// \param __p
   1969 ///    A pointer to a 128-bit memory location. The address of the memory
   1970 ///    location has to be 16-byte aligned.
   1971 /// \param __a
   1972 ///    A 128-bit vector of [4 x float] containing the values to be stored.
   1973 static __inline__ void __DEFAULT_FN_ATTRS
   1974 _mm_store_ps(float *__p, __m128 __a)
   1975 {
   1976   *(__m128*)__p = __a;
   1977 }
   1978 
   1979 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
   1980 ///    four contiguous elements in an aligned memory location.
   1981 ///
   1982 /// \headerfile <x86intrin.h>
   1983 ///
   1984 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
   1985 ///    instruction.
   1986 ///
   1987 /// \param __p
   1988 ///    A pointer to a 128-bit memory location.
   1989 /// \param __a
   1990 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
   1991 ///    of the four contiguous elements pointed by \a __p.
   1992 static __inline__ void __DEFAULT_FN_ATTRS
   1993 _mm_store1_ps(float *__p, __m128 __a)
   1994 {
   1995   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
   1996   _mm_store_ps(__p, __a);
   1997 }
   1998 
   1999 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
   2000 ///    four contiguous elements in an aligned memory location.
   2001 ///
   2002 /// \headerfile <x86intrin.h>
   2003 ///
   2004 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
   2005 ///    instruction.
   2006 ///
   2007 /// \param __p
   2008 ///    A pointer to a 128-bit memory location.
   2009 /// \param __a
   2010 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
   2011 ///    of the four contiguous elements pointed by \a __p.
   2012 static __inline__ void __DEFAULT_FN_ATTRS
   2013 _mm_store_ps1(float *__p, __m128 __a)
   2014 {
   2015   return _mm_store1_ps(__p, __a);
   2016 }
   2017 
   2018 /// \brief Stores float values from a 128-bit vector of [4 x float] to an
   2019 ///    aligned memory location in reverse order.
   2020 ///
   2021 /// \headerfile <x86intrin.h>
   2022 ///
   2023 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
   2024 ///    instruction.
   2025 ///
   2026 /// \param __p
   2027 ///    A pointer to a 128-bit memory location. The address of the memory
   2028 ///    location has to be 128-bit aligned.
   2029 /// \param __a
   2030 ///    A 128-bit vector of [4 x float] containing the values to be stored.
   2031 static __inline__ void __DEFAULT_FN_ATTRS
   2032 _mm_storer_ps(float *__p, __m128 __a)
   2033 {
   2034   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
   2035   _mm_store_ps(__p, __a);
   2036 }
   2037 
   2038 #define _MM_HINT_T0 3
   2039 #define _MM_HINT_T1 2
   2040 #define _MM_HINT_T2 1
   2041 #define _MM_HINT_NTA 0
   2042 
   2043 #ifndef _MSC_VER
   2044 /* FIXME: We have to #define this because "sel" must be a constant integer, and
   2045    Sema doesn't do any form of constant propagation yet. */
   2046 
   2047 /// \brief Loads one cache line of data from the specified address to a location
   2048 ///    closer to the processor.
   2049 ///
   2050 /// \headerfile <x86intrin.h>
   2051 ///
   2052 /// \code
   2053 /// void _mm_prefetch(const void * a, const int sel);
   2054 /// \endcode
   2055 ///
   2056 /// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
   2057 ///
   2058 /// \param a
   2059 ///    A pointer to a memory location containing a cache line of data.
   2060 /// \param sel
   2061 ///    A predefined integer constant specifying the type of prefetch
   2062 ///    operation: \n
   2063 ///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
   2064 ///    PREFETCHNTA instruction will be generated. \n
   2065 ///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
   2066 ///    be generated. \n
   2067 ///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
   2068 ///    be generated. \n
   2069 ///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
   2070 ///    be generated.
   2071 #define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
   2072 #endif
   2073 
   2074 /// \brief Stores a 64-bit integer in the specified aligned memory location. To
   2075 ///    minimize caching, the data is flagged as non-temporal (unlikely to be
   2076 ///    used again soon).
   2077 ///
   2078 /// \headerfile <x86intrin.h>
   2079 ///
   2080 /// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
   2081 ///
   2082 /// \param __p
   2083 ///    A pointer to an aligned memory location used to store the register value.
   2084 /// \param __a
   2085 ///    A 64-bit integer containing the value to be stored.
   2086 static __inline__ void __DEFAULT_FN_ATTRS
   2087 _mm_stream_pi(__m64 *__p, __m64 __a)
   2088 {
   2089   __builtin_ia32_movntq(__p, __a);
   2090 }
   2091 
   2092 /// \brief Moves packed float values from a 128-bit vector of [4 x float] to a
   2093 ///    128-bit aligned memory location. To minimize caching, the data is flagged
   2094 ///    as non-temporal (unlikely to be used again soon).
   2095 ///
   2096 /// \headerfile <x86intrin.h>
   2097 ///
   2098 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
   2099 ///
   2100 /// \param __p
   2101 ///    A pointer to a 128-bit aligned memory location that will receive the
   2102 ///    single-precision floating-point values.
   2103 /// \param __a
   2104 ///    A 128-bit vector of [4 x float] containing the values to be moved.
   2105 static __inline__ void __DEFAULT_FN_ATTRS
   2106 _mm_stream_ps(float *__p, __m128 __a)
   2107 {
   2108   __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
   2109 }
   2110 
   2111 #if defined(__cplusplus)
   2112 extern "C" {
   2113 #endif
   2114 
   2115 /// \brief Forces strong memory ordering (serialization) between store
   2116 ///    instructions preceding this instruction and store instructions following
   2117 ///    this instruction, ensuring the system completes all previous stores
   2118 ///    before executing subsequent stores.
   2119 ///
   2120 /// \headerfile <x86intrin.h>
   2121 ///
   2122 /// This intrinsic corresponds to the <c> SFENCE </c> instruction.
   2123 ///
   2124 void _mm_sfence(void);
   2125 
   2126 #if defined(__cplusplus)
   2127 } // extern "C"
   2128 #endif
   2129 
   2130 /// \brief Extracts 16-bit element from a 64-bit vector of [4 x i16] and
   2131 ///    returns it, as specified by the immediate integer operand.
   2132 ///
   2133 /// \headerfile <x86intrin.h>
   2134 ///
   2135 /// \code
   2136 /// int _mm_extract_pi16(__m64 a, int n);
   2137 /// \endcode
   2138 ///
   2139 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
   2140 ///
   2141 /// \param a
   2142 ///    A 64-bit vector of [4 x i16].
   2143 /// \param n
   2144 ///    An immediate integer operand that determines which bits are extracted: \n
   2145 ///    0: Bits [15:0] are copied to the destination. \n
   2146 ///    1: Bits [31:16] are copied to the destination. \n
   2147 ///    2: Bits [47:32] are copied to the destination. \n
   2148 ///    3: Bits [63:48] are copied to the destination.
   2149 /// \returns A 16-bit integer containing the extracted 16 bits of packed data.
   2150 #define _mm_extract_pi16(a, n) __extension__ ({ \
   2151   (int)__builtin_ia32_vec_ext_v4hi((__m64)a, (int)n); })
   2152 
   2153 /// \brief Copies data from the 64-bit vector of [4 x i16] to the destination,
   2154 ///    and inserts the lower 16 bits of an integer operand at the 16-bit offset
   2155 ///    specified by the immediate operand \a n.
   2156 ///
   2157 /// \headerfile <x86intrin.h>
   2158 ///
   2159 /// \code
   2160 /// __m64 _mm_insert_pi16(__m64 a, int d, int n);
   2161 /// \endcode
   2162 ///
   2163 /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
   2164 ///
   2165 /// \param a
   2166 ///    A 64-bit vector of [4 x i16].
   2167 /// \param d
   2168 ///    An integer. The lower 16-bit value from this operand is written to the
   2169 ///    destination at the offset specified by operand \a n.
   2170 /// \param n
   2171 ///    An immediate integer operand that selects which 16 bits of the
   2172 ///    destination receive the value from operand \a d: \n
   2173 ///    0: Bits [15:0] of the destination are written. \n
   2174 ///    1: Bits [31:16] of the destination are written. \n
   2175 ///    2: Bits [47:32] of the destination are written. \n
   2176 ///    3: Bits [63:48] of the destination are written. \n
   2177 ///    The remaining bits in the destination are copied from the corresponding
   2178 ///    bits in operand \a a.
   2179 /// \returns A 64-bit integer vector containing the copied packed data from the
   2180 ///    operands.
   2181 #define _mm_insert_pi16(a, d, n) __extension__ ({ \
   2182   (__m64)__builtin_ia32_vec_set_v4hi((__m64)(a), (int)(d), (int)(n)); })
   2183 
   2184 /// \brief Compares each of the corresponding packed signed 16-bit integer
   2185 ///    values of the 64-bit integer vectors, and writes the greater value to the
   2186 ///    corresponding bits in the destination.
   2187 ///
   2188 /// \headerfile <x86intrin.h>
   2189 ///
   2190 /// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
   2191 ///
   2192 /// \param __a
   2193 ///    A 64-bit integer vector containing one of the source operands.
   2194 /// \param __b
   2195 ///    A 64-bit integer vector containing one of the source operands.
   2196 /// \returns A 64-bit integer vector containing the greater value of each pair.
   2197 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2198 _mm_max_pi16(__m64 __a, __m64 __b)
   2199 {
   2200   return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
   2201 }
   2202 
   2203 /// \brief Compares each of the corresponding packed 8-bit unsigned integer
   2204 ///    values of the 64-bit integer vectors, and writes the greater value to the
   2205 ///    corresponding bits in the destination.
   2206 ///
   2207 /// \headerfile <x86intrin.h>
   2208 ///
   2209 /// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
   2210 ///
   2211 /// \param __a
   2212 ///    A 64-bit integer vector containing one of the source operands.
   2213 /// \param __b
   2214 ///    A 64-bit integer vector containing one of the source operands.
   2215 /// \returns A 64-bit integer vector containing the greater value of each pair.
   2216 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2217 _mm_max_pu8(__m64 __a, __m64 __b)
   2218 {
   2219   return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
   2220 }
   2221 
   2222 /// \brief Compares each of the corresponding packed signed 16-bit integer
   2223 ///    values of the 64-bit integer vectors, and writes the lesser value to the
   2224 ///    corresponding bits in the destination.
   2225 ///
   2226 /// \headerfile <x86intrin.h>
   2227 ///
   2228 /// This intrinsic corresponds to the <c> PMINSW </c> instruction.
   2229 ///
   2230 /// \param __a
   2231 ///    A 64-bit integer vector containing one of the source operands.
   2232 /// \param __b
   2233 ///    A 64-bit integer vector containing one of the source operands.
   2234 /// \returns A 64-bit integer vector containing the lesser value of each pair.
   2235 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2236 _mm_min_pi16(__m64 __a, __m64 __b)
   2237 {
   2238   return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
   2239 }
   2240 
   2241 /// \brief Compares each of the corresponding packed 8-bit unsigned integer
   2242 ///    values of the 64-bit integer vectors, and writes the lesser value to the
   2243 ///    corresponding bits in the destination.
   2244 ///
   2245 /// \headerfile <x86intrin.h>
   2246 ///
   2247 /// This intrinsic corresponds to the <c> PMINUB </c> instruction.
   2248 ///
   2249 /// \param __a
   2250 ///    A 64-bit integer vector containing one of the source operands.
   2251 /// \param __b
   2252 ///    A 64-bit integer vector containing one of the source operands.
   2253 /// \returns A 64-bit integer vector containing the lesser value of each pair.
   2254 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2255 _mm_min_pu8(__m64 __a, __m64 __b)
   2256 {
   2257   return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
   2258 }
   2259 
   2260 /// \brief Takes the most significant bit from each 8-bit element in a 64-bit
   2261 ///    integer vector to create an 8-bit mask value. Zero-extends the value to
   2262 ///    32-bit integer and writes it to the destination.
   2263 ///
   2264 /// \headerfile <x86intrin.h>
   2265 ///
   2266 /// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
   2267 ///
   2268 /// \param __a
   2269 ///    A 64-bit integer vector containing the values with bits to be extracted.
   2270 /// \returns The most significant bit from each 8-bit element in the operand,
   2271 ///    written to bits [7:0].
   2272 static __inline__ int __DEFAULT_FN_ATTRS
   2273 _mm_movemask_pi8(__m64 __a)
   2274 {
   2275   return __builtin_ia32_pmovmskb((__v8qi)__a);
   2276 }
   2277 
   2278 /// \brief Multiplies packed 16-bit unsigned integer values and writes the
   2279 ///    high-order 16 bits of each 32-bit product to the corresponding bits in
   2280 ///    the destination.
   2281 ///
   2282 /// \headerfile <x86intrin.h>
   2283 ///
   2284 /// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
   2285 ///
   2286 /// \param __a
   2287 ///    A 64-bit integer vector containing one of the source operands.
   2288 /// \param __b
   2289 ///    A 64-bit integer vector containing one of the source operands.
   2290 /// \returns A 64-bit integer vector of the high 16 bits of each product.
   2291 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2292 _mm_mulhi_pu16(__m64 __a, __m64 __b)
   2293 {
   2294   return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
   2295 }
   2296 
   2297 /// \brief Shuffles the four 16-bit integers from a 64-bit integer vector to
   2298 ///    the destination, as specified by the immediate value operand.
   2299 ///
   2300 /// \headerfile <x86intrin.h>
   2301 ///
   2302 /// \code
   2303 /// __m64 _mm_shuffle_pi16(__m64 a, const int n);
   2304 /// \endcode
   2305 ///
   2306 /// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
   2307 ///
   2308 /// \param a
   2309 ///    A 64-bit integer vector containing the values to be shuffled.
   2310 /// \param n
   2311 ///    An 8-bit immediate value specifying which elements to copy from \a a.
   2312 ///    Each 2-bit field selects the source of one 16-bit element of the
   2313 ///    64-bit destination, as follows: \n
   2314 ///    Bits [1:0] are used to assign values to bits [15:0] in the
   2315 ///    destination. \n
   2316 ///    Bits [3:2] are used to assign values to bits [31:16] in the
   2317 ///    destination. \n
   2318 ///    Bits [5:4] are used to assign values to bits [47:32] in the
   2319 ///    destination. \n
   2320 ///    Bits [7:6] are used to assign values to bits [63:48] in the
   2321 ///    destination. \n
   2322 ///    Bit value assignments: \n
   2323 ///    00: assigned from bits [15:0] of \a a. \n
   2324 ///    01: assigned from bits [31:16] of \a a. \n
   2325 ///    10: assigned from bits [47:32] of \a a. \n
   2326 ///    11: assigned from bits [63:48] of \a a.
   2327 /// \returns A 64-bit integer vector containing the shuffled values.
   2328 #define _mm_shuffle_pi16(a, n) __extension__ ({ \
   2329   (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); })
   2330 
   2331 /// \brief Conditionally copies the values from each 8-bit element in the first
   2332 ///    64-bit integer vector operand to the specified memory location, as
   2333 ///    selected by the most significant bit in the corresponding element in the
   2334 ///    second 64-bit integer vector operand.
   2335 ///
   2336 ///    To minimize caching, the data is flagged as non-temporal
   2337 ///    (unlikely to be used again soon).
   2338 ///
   2339 /// \headerfile <x86intrin.h>
   2340 ///
   2341 /// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
   2342 ///
   2343 /// \param __d
   2344 ///    A 64-bit integer vector containing the values with elements to be copied.
   2345 /// \param __n
   2346 ///    A 64-bit integer vector operand. The most significant bit from each 8-bit
   2347 ///    element determines whether the corresponding element in operand \a __d
   2348 ///    is copied. If the most significant bit of a given element is 1, the
   2349 ///    corresponding element in operand \a __d is copied.
   2350 /// \param __p
   2351 ///    A pointer to a 64-bit memory location that will receive the conditionally
   2352 ///    copied integer values. The address of the memory location does not have
   2353 ///    to be aligned.
   2354 static __inline__ void __DEFAULT_FN_ATTRS
   2355 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
   2356 {
   2357   __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); // no return value; the only effect is the store through __p
   2358 }
   2359 
   2360 /// \brief Computes the rounded averages of the corresponding packed unsigned
   2361 ///    8-bit integer values of the two operands and writes the averages to the
   2362 ///    corresponding bits in the destination.
   2363 ///
   2364 /// \headerfile <x86intrin.h>
   2365 ///
   2366 /// This intrinsic corresponds to the <c> PAVGB </c> instruction.
   2367 ///
   2368 /// \param __a
   2369 ///    A 64-bit integer vector containing one of the source operands.
   2370 /// \param __b
   2371 ///    A 64-bit integer vector containing one of the source operands.
   2372 /// \returns A 64-bit integer vector containing the per-element averages.
   2373 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2374 _mm_avg_pu8(__m64 __a, __m64 __b)
   2375 {
   2376   return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
   2377 }
   2378 
   2379 /// \brief Computes the rounded averages of the corresponding packed unsigned
   2380 ///    16-bit integer values of the two operands and writes the averages to the
   2381 ///    corresponding bits in the destination.
   2382 ///
   2383 /// \headerfile <x86intrin.h>
   2384 ///
   2385 /// This intrinsic corresponds to the <c> PAVGW </c> instruction.
   2386 ///
   2387 /// \param __a
   2388 ///    A 64-bit integer vector containing one of the source operands.
   2389 /// \param __b
   2390 ///    A 64-bit integer vector containing one of the source operands.
   2391 /// \returns A 64-bit integer vector containing the per-element averages.
   2392 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2393 _mm_avg_pu16(__m64 __a, __m64 __b)
   2394 {
   2395   return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
   2396 }
   2397 
   2398 /// \brief Subtracts the corresponding 8-bit unsigned integer values of the two
   2399 ///    64-bit vector operands and computes the absolute value of each
   2400 ///    difference. The sum of the 8 absolute differences is then written to
   2401 ///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
   2402 ///
   2403 /// \headerfile <x86intrin.h>
   2404 ///
   2405 /// This intrinsic corresponds to the <c> PSADBW </c> instruction.
   2406 ///
   2407 /// \param __a
   2408 ///    A 64-bit integer vector containing one of the source operands.
   2409 /// \param __b
   2410 ///    A 64-bit integer vector containing one of the source operands.
   2411 /// \returns A 64-bit integer vector whose lower 16 bits contain the sum of the
   2412 ///    absolute differences between both operands. The upper bits are
   2413 ///    cleared.
   2414 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2415 _mm_sad_pu8(__m64 __a, __m64 __b)
   2416 {
   2417   return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
   2418 }
   2419 
   2420 #if defined(__cplusplus)
   2421 extern "C" {
   2422 #endif
   2423 
   2424 /// \brief Returns the contents of the MXCSR register as a 32-bit unsigned
   2425 ///    integer value.
   2426 ///
   2427 ///    There are several groups of macros associated with this
   2428 ///    intrinsic, including:
   2429 ///    <ul>
   2430 ///    <li>
   2431 ///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
   2432 ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
   2433 ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
   2434 ///      _MM_GET_EXCEPTION_STATE().
   2435 ///    </li>
   2436 ///    <li>
   2437 ///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
   2438 ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
   2439 ///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
   2440 ///    </li>
   2441 ///    <li>
   2442 ///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
   2443 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
   2444 ///      _MM_GET_ROUNDING_MODE().
   2445 ///    </li>
   2446 ///    <li>
   2447 ///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
   2448 ///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
   2449 ///    </li>
   2450 ///    <li>
   2451 ///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
   2452 ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
   2453 ///      _MM_GET_DENORMALS_ZERO_MODE().
   2454 ///    </li>
   2455 ///    </ul>
   2456 ///
   2457 ///    For example, the expression below checks if an overflow exception has
   2458 ///    occurred:
   2459 ///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
   2460 ///
   2461 ///    The following example gets the current rounding mode:
   2462 ///      _MM_GET_ROUNDING_MODE()
   2463 ///
   2464 /// \headerfile <x86intrin.h>
   2465 ///
   2466 /// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
   2467 ///
   2468 /// \returns A 32-bit unsigned integer containing the contents of the MXCSR
   2469 ///    register.
   2470 unsigned int _mm_getcsr(void);
   2471 
   2472 /// \brief Sets the MXCSR register with the 32-bit unsigned integer value.
   2473 ///
   2474 ///    There are several groups of macros associated with this intrinsic,
   2475 ///    including:
   2476 ///    <ul>
   2477 ///    <li>
   2478 ///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
   2479 ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
   2480 ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
   2481 ///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
   2482 ///    </li>
   2483 ///    <li>
   2484 ///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
   2485 ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
   2486 ///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
   2487 ///      of these macros.
   2488 ///    </li>
   2489 ///    <li>
   2490 ///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
   2491 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
   2492 ///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
   2493 ///    </li>
   2494 ///    <li>
   2495 ///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
   2496 ///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
   2497 ///      one of these macros.
   2498 ///    </li>
   2499 ///    <li>
   2500 ///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
   2501 ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
   2502 ///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
   2503 ///    </li>
   2504 ///    </ul>
   2505 ///
   2506 ///    For example, the following expression causes subsequent floating-point
   2507 ///    operations to round up (the rounding-mode field is cleared first):
   2508 ///      _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_UP)
   2509 ///
   2510 ///    The following example sets the DAZ and FTZ flags:
   2511 ///      void setFlags() {
   2512 ///        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
   2513 ///        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
   2514 ///      }
   2515 ///
   2516 /// \headerfile <x86intrin.h>
   2517 ///
   2518 /// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
   2519 ///
   2520 /// \param __i
   2521 ///    A 32-bit unsigned integer value to be written to the MXCSR register.
   2522 void _mm_setcsr(unsigned int __i);
   2523 
   2524 #if defined(__cplusplus)
   2525 } // extern "C"
   2526 #endif
   2527 
   2528 /// \brief Selects four float values from the 128-bit operands of [4 x float],
   2529 ///    as specified by the immediate value operand.
   2530 ///
   2531 /// \headerfile <x86intrin.h>
   2532 ///
   2533 /// \code
   2534 /// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
   2535 /// \endcode
   2536 ///
   2537 /// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
   2538 ///
   2539 /// \param a
   2540 ///    A 128-bit vector of [4 x float].
   2541 /// \param b
   2542 ///    A 128-bit vector of [4 x float].
   2543 /// \param mask
   2544 ///    An 8-bit immediate value specifying which elements to copy from \a a
   2545 ///    and \a b. \n
   2546 ///    Bits [3:0] specify the values copied from operand \a a. \n
   2547 ///    Bits [7:4] specify the values copied from operand \a b. \n
   2548 ///    The destinations within the 128-bit destination are assigned values as
   2549 ///    follows: \n
   2550 ///    Bits [1:0] are used to assign values to bits [31:0] in the
   2551 ///    destination. \n
   2552 ///    Bits [3:2] are used to assign values to bits [63:32] in the
   2553 ///    destination. \n
   2554 ///    Bits [5:4] are used to assign values to bits [95:64] in the
   2555 ///    destination. \n
   2556 ///    Bits [7:6] are used to assign values to bits [127:96] in the
   2557 ///    destination. \n
   2558 ///    Bit value assignments: \n
   2559 ///    00: Bits [31:0] copied from the specified operand. \n
   2560 ///    01: Bits [63:32] copied from the specified operand. \n
   2561 ///    10: Bits [95:64] copied from the specified operand. \n
   2562 ///    11: Bits [127:96] copied from the specified operand.
   2563 /// \returns A 128-bit vector of [4 x float] containing the shuffled values.
   2564 #define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
   2565   (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
   2566                                   0 + (((mask) >> 0) & 0x3), \
   2567                                   0 + (((mask) >> 2) & 0x3), \
   2568                                   4 + (((mask) >> 4) & 0x3), \
   2569                                   4 + (((mask) >> 6) & 0x3)); })
   2570 
   2571 /// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
   2572 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
   2573 ///
   2574 /// \headerfile <x86intrin.h>
   2575 ///
   2576 /// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
   2577 ///
   2578 /// \param __a
   2579 ///    A 128-bit vector of [4 x float]. \n
   2580 ///    Bits [95:64] are written to bits [31:0] of the destination. \n
   2581 ///    Bits [127:96] are written to bits [95:64] of the destination.
   2582 /// \param __b
   2583 ///    A 128-bit vector of [4 x float]. \n
   2584 ///    Bits [95:64] are written to bits [63:32] of the destination. \n
   2585 ///    Bits [127:96] are written to bits [127:96] of the destination.
   2586 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
   2587 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2588 _mm_unpackhi_ps(__m128 __a, __m128 __b)
   2589 {
   2590   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7); // indices 0-3 select from __a, 4-7 from __b
   2591 }
   2592 
   2593 /// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
   2594 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
   2595 ///
   2596 /// \headerfile <x86intrin.h>
   2597 ///
   2598 /// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
   2599 ///
   2600 /// \param __a
   2601 ///    A 128-bit vector of [4 x float]. \n
   2602 ///    Bits [31:0] are written to bits [31:0] of the destination. \n
   2603 ///    Bits [63:32] are written to bits [95:64] of the destination.
   2604 /// \param __b
   2605 ///    A 128-bit vector of [4 x float]. \n
   2606 ///    Bits [31:0] are written to bits [63:32] of the destination. \n
   2607 ///    Bits [63:32] are written to bits [127:96] of the destination.
   2608 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
   2609 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2610 _mm_unpacklo_ps(__m128 __a, __m128 __b)
   2611 {
   2612   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5); // indices 0-3 select from __a, 4-7 from __b
   2613 }
   2614 
   2615 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
   2616 ///    32 bits are set to the lower 32 bits of the second parameter. The upper
   2617 ///    96 bits are set to the upper 96 bits of the first parameter.
   2618 ///
   2619 /// \headerfile <x86intrin.h>
   2620 ///
   2621 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
   2622 ///
   2623 /// \param __a
   2624 ///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
   2625 ///    written to the upper 96 bits of the result.
   2626 /// \param __b
   2627 ///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
   2628 ///    written to the lower 32 bits of the result.
   2629 /// \returns A [4 x float] vector: {__b[0], __a[1], __a[2], __a[3]}.
   2630 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2631 _mm_move_ss(__m128 __a, __m128 __b)
   2632 {
   2633   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 4, 1, 2, 3); // index 4 is element 0 of __b
   2634 }
   2635 
   2636 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
   2637 ///    64 bits are set to the upper 64 bits of the second parameter. The upper
   2638 ///    64 bits are set to the upper 64 bits of the first parameter.
   2639 ///
   2640 /// \headerfile <x86intrin.h>
   2641 ///
   2642 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
   2643 ///
   2644 /// \param __a
   2645 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
   2646 ///    written to the upper 64 bits of the result.
   2647 /// \param __b
   2648 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
   2649 ///    written to the lower 64 bits of the result.
   2650 /// \returns A [4 x float] vector: {__b[2], __b[3], __a[2], __a[3]}.
   2651 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2652 _mm_movehl_ps(__m128 __a, __m128 __b)
   2653 {
   2654   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3); // indices 6, 7 are elements 2, 3 of __b
   2655 }
   2656 
   2657 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
   2658 ///    64 bits are set to the lower 64 bits of the first parameter. The upper
   2659 ///    64 bits are set to the lower 64 bits of the second parameter.
   2660 ///
   2661 /// \headerfile <x86intrin.h>
   2662 ///
   2663 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
   2664 ///
   2665 /// \param __a
   2666 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
   2667 ///    written to the lower 64 bits of the result.
   2668 /// \param __b
   2669 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
   2670 ///    written to the upper 64 bits of the result.
   2671 /// \returns A [4 x float] vector: {__a[0], __a[1], __b[0], __b[1]}.
   2672 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2673 _mm_movelh_ps(__m128 __a, __m128 __b)
   2674 {
   2675   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5); // indices 4, 5 are elements 0, 1 of __b
   2676 }
   2677 
   2678 /// \brief Converts a 64-bit vector of signed [4 x i16] into a 128-bit vector
   2679 ///    of [4 x float].
   2680 ///
   2681 /// \headerfile <x86intrin.h>
   2682 ///
   2683 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
   2684 ///
   2685 /// \param __a
   2686 ///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
   2687 ///    from the corresponding elements in this operand.
   2688 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
   2689 ///    values from the operand.
   2690 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2691 _mm_cvtpi16_ps(__m64 __a)
   2692 {
   2693   __m64 __b, __c;
   2694   __m128 __r;
   2695 
   2696   __b = _mm_setzero_si64();
   2697   __b = _mm_cmpgt_pi16(__b, __a);    // 0 > a per element: the sign-extension words
   2698   __c = _mm_unpackhi_pi16(__a, __b); // upper two elements widened to 32 bits
   2699   __r = _mm_setzero_ps();
   2700   __r = _mm_cvtpi32_ps(__r, __c);    // converted upper pair goes into the low half
   2701   __r = _mm_movelh_ps(__r, __r);     // copy the low half into the high half
   2702   __c = _mm_unpacklo_pi16(__a, __b); // lower two elements widened to 32 bits
   2703   __r = _mm_cvtpi32_ps(__r, __c);    // converted lower pair replaces the low half
   2704 
   2705   return __r;
   2706 }
   2707 
   2708 /// \brief Converts a 64-bit vector of 16-bit unsigned integer values into a
   2709 ///    128-bit vector of [4 x float].
   2710 ///
   2711 /// \headerfile <x86intrin.h>
   2712 ///
   2713 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
   2714 ///
   2715 /// \param __a
   2716 ///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
   2717 ///    destination are copied from the corresponding elements in this operand.
   2718 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
   2719 ///    values from the operand.
   2720 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2721 _mm_cvtpu16_ps(__m64 __a)
   2722 {
   2723   __m64 __b, __c;
   2724   __m128 __r;
   2725 
   2726   __b = _mm_setzero_si64();          // zero words: interleaving them zero-extends
   2727   __c = _mm_unpackhi_pi16(__a, __b); // upper two elements widened to 32 bits
   2728   __r = _mm_setzero_ps();
   2729   __r = _mm_cvtpi32_ps(__r, __c);    // converted upper pair goes into the low half
   2730   __r = _mm_movelh_ps(__r, __r);     // copy the low half into the high half
   2731   __c = _mm_unpacklo_pi16(__a, __b); // lower two elements widened to 32 bits
   2732   __r = _mm_cvtpi32_ps(__r, __c);    // converted lower pair replaces the low half
   2733 
   2734   return __r;
   2735 }
   2736 
   2737 /// \brief Converts the lower four signed 8-bit values from a 64-bit vector of
   2738 ///    [8 x i8] into a 128-bit vector of [4 x float].
   2739 ///
   2740 /// \headerfile <x86intrin.h>
   2741 ///
   2742 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
   2743 ///
   2744 /// \param __a
   2745 ///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
   2746 ///    from the corresponding lower 4 elements in this operand.
   2747 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
   2748 ///    values from the operand.
   2749 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2750 _mm_cvtpi8_ps(__m64 __a)
   2751 {
   2752   __m64 __b;
   2753 
   2754   __b = _mm_setzero_si64();
   2755   __b = _mm_cmpgt_pi8(__b, __a);    // 0 > a per byte: the sign-extension bytes
   2756   __b = _mm_unpacklo_pi8(__a, __b); // lower four bytes widened to 16 bits
   2757 
   2758   return _mm_cvtpi16_ps(__b);       // reuse the [4 x i16] conversion
   2759 }
   2760 
   2761 /// \brief Converts the lower four unsigned 8-bit integer values from a 64-bit
   2762 ///    vector of [8 x u8] into a 128-bit vector of [4 x float].
   2763 ///
   2764 /// \headerfile <x86intrin.h>
   2765 ///
   2766 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
   2767 ///
   2768 /// \param __a
   2769 ///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
   2770 ///    destination are copied from the corresponding lower 4 elements in this
   2771 ///    operand.
   2772 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
   2773 ///    values from the source operand.
   2774 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2775 _mm_cvtpu8_ps(__m64 __a)
   2776 {
   2777   __m64 __b;
   2778 
   2779   __b = _mm_setzero_si64();
   2780   __b = _mm_unpacklo_pi8(__a, __b); // interleave with zero bytes: zero-extend to 16 bits
   2781 
   2782   return _mm_cvtpi16_ps(__b);       // widened values are non-negative, so the signed path is used
   2783 }
   2784 
   2785 /// \brief Converts the two 32-bit signed integer values from each 64-bit vector
   2786 ///    operand of [2 x i32] into a 128-bit vector of [4 x float].
   2787 ///
   2788 /// \headerfile <x86intrin.h>
   2789 ///
   2790 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
   2791 ///
   2792 /// \param __a
   2793 ///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
   2794 ///    copied from the elements in this operand.
   2795 /// \param __b
   2796 ///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
   2797 ///    copied from the elements in this operand.
   2798 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
   2799 ///    copied and converted values from the first operand. The upper 64 bits
   2800 ///    contain the copied and converted values from the second operand.
   2801 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2802 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
   2803 {
   2804   __m128 __c;
   2805 
   2806   __c = _mm_setzero_ps();
   2807   __c = _mm_cvtpi32_ps(__c, __b); // converted __b goes into the low half
   2808   __c = _mm_movelh_ps(__c, __c);  // copy the low half into the high half
   2809 
   2810   return _mm_cvtpi32_ps(__c, __a); // converted __a replaces the low half
   2811 }
   2812 
   2813 /// \brief Converts each single-precision floating-point element of a 128-bit
   2814 ///    floating-point vector of [4 x float] into a 16-bit signed integer, and
   2815 ///    packs the results into a 64-bit integer vector of [4 x i16].
   2816 ///
   2817 ///    If the floating-point element is NaN or infinity, or if the
   2818 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
   2819 ///    it is converted to 0x8000. Otherwise if the floating-point element is
   2820 ///    greater than 0x7FFF, it is converted to 0x7FFF.
   2821 ///
   2822 /// \headerfile <x86intrin.h>
   2823 ///
   2824 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
   2825 ///
   2826 /// \param __a
   2827 ///    A 128-bit floating-point vector of [4 x float].
   2828 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
   2829 ///    values.
   2830 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2831 _mm_cvtps_pi16(__m128 __a)
   2832 {
   2833   __m64 __b, __c;
   2834 
   2835   __b = _mm_cvtps_pi32(__a);      // lower two floats -> two 32-bit integers
   2836   __a = _mm_movehl_ps(__a, __a);  // bring the upper two floats into the low half
   2837   __c = _mm_cvtps_pi32(__a);      // upper two floats -> two 32-bit integers
   2838 
   2839   return _mm_packs_pi32(__b, __c); // narrow the four 32-bit values to 16 bits
   2840 }
   2841 
   2842 /// \brief Converts each single-precision floating-point element of a 128-bit
   2843 ///    floating-point vector of [4 x float] into an 8-bit signed integer, and
   2844 ///    packs the results into the lower 32 bits of a 64-bit integer vector of
   2845 ///    [8 x i8]. The upper 32 bits of the vector are set to 0.
   2846 ///
   2847 ///    If the floating-point element is NaN or infinity, or if the
   2848 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
   2849 ///    is converted to 0x80. Otherwise if the floating-point element is greater
   2850 ///    than 0x7F, it is converted to 0x7F.
   2851 ///
   2852 /// \headerfile <x86intrin.h>
   2853 ///
   2854 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
   2855 ///
   2856 /// \param __a
   2857 ///    A 128-bit floating-point vector of [4 x float].
   2858 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
   2859 ///    converted values and the upper 32 bits are set to zero.
   2860 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2861 _mm_cvtps_pi8(__m128 __a)
   2862 {
   2863   __m64 __b, __c;
   2864 
   2865   __b = _mm_cvtps_pi16(__a);       // four floats -> [4 x i16]
   2866   __c = _mm_setzero_si64();        // zero source for the upper four result bytes
   2867 
   2868   return _mm_packs_pi16(__b, __c); // narrow to 8 bits; the zero half fills the top
   2869 }
   2870 
   2871 /// \brief Extracts the sign bits from each single-precision floating-point
   2872 ///    element of a 128-bit floating-point vector of [4 x float] and returns the
   2873 ///    sign bits in bits [3:0] of the result. Bits [31:4] of the result are set
   2874 ///    to zero.
   2875 ///
   2876 /// \headerfile <x86intrin.h>
   2877 ///
   2878 /// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
   2879 ///
   2880 /// \param __a
   2881 ///    A 128-bit floating-point vector of [4 x float].
   2882 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
   2883 ///    single-precision floating-point element of the parameter. Bits [31:4] are
   2884 ///    set to zero.
   2885 static __inline__ int __DEFAULT_FN_ATTRS
   2886 _mm_movemask_ps(__m128 __a)
   2887 {
   2888   return __builtin_ia32_movmskps((__v4sf)__a);
   2889 }
   2890 
   2891 
   /* Requests 16-byte alignment for the declaration it is attached to.
    * The attribute is spelled __aligned__ (reserved form), matching the
    * __vector_size__ spelling used by this header's typedefs, so that a user
    * macro named `aligned` cannot break the expansion. */
   #define _MM_ALIGN16 __attribute__((__aligned__(16)))
   2893 
   /* Builds a shuffle-control immediate from four selectors: fp3 is placed in
    * bits [7:6], fp2 in bits [5:4], fp1 in bits [3:2] and fp0 in bits [1:0]
    * (assuming every selector is in the range 0..3; the arguments are not
    * masked). */
   #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
     ((fp0) | ((fp1) << 2) | ((fp2) << 4) | ((fp3) << 6))
   2895 
   /* Bit fields of the control/status word that is read with _mm_getcsr() and
    * written with _mm_setcsr() (the SSE MXCSR register). Single-bit flags are
    * written as shifts so the bit position is visible; each *_MASK constant is
    * the union of the bits belonging to its field. */

   /* Exception state flags: bits 0..5. */
   #define _MM_EXCEPT_INVALID    (1 << 0)
   #define _MM_EXCEPT_DENORM     (1 << 1)
   #define _MM_EXCEPT_DIV_ZERO   (1 << 2)
   #define _MM_EXCEPT_OVERFLOW   (1 << 3)
   #define _MM_EXCEPT_UNDERFLOW  (1 << 4)
   #define _MM_EXCEPT_INEXACT    (1 << 5)
   #define _MM_EXCEPT_MASK       (_MM_EXCEPT_INVALID | _MM_EXCEPT_DENORM | \
                                  _MM_EXCEPT_DIV_ZERO | _MM_EXCEPT_OVERFLOW | \
                                  _MM_EXCEPT_UNDERFLOW | _MM_EXCEPT_INEXACT)

   /* Exception mask bits: bits 7..12, one per exception flag above. */
   #define _MM_MASK_INVALID      (1 << 7)
   #define _MM_MASK_DENORM       (1 << 8)
   #define _MM_MASK_DIV_ZERO     (1 << 9)
   #define _MM_MASK_OVERFLOW     (1 << 10)
   #define _MM_MASK_UNDERFLOW    (1 << 11)
   #define _MM_MASK_INEXACT      (1 << 12)
   #define _MM_MASK_MASK         (_MM_MASK_INVALID | _MM_MASK_DENORM | \
                                  _MM_MASK_DIV_ZERO | _MM_MASK_OVERFLOW | \
                                  _MM_MASK_UNDERFLOW | _MM_MASK_INEXACT)

   /* Rounding mode: two-bit field in bits 13..14. */
   #define _MM_ROUND_NEAREST     (0 << 13)
   #define _MM_ROUND_DOWN        (1 << 13)
   #define _MM_ROUND_UP          (2 << 13)
   #define _MM_ROUND_TOWARD_ZERO (3 << 13)
   #define _MM_ROUND_MASK        (3 << 13)

   /* Flush-to-zero mode: bit 15. */
   #define _MM_FLUSH_ZERO_MASK   (1 << 15)
   #define _MM_FLUSH_ZERO_ON     (1 << 15)
   #define _MM_FLUSH_ZERO_OFF    (0 << 15)
   2921 
   /* Field accessors for the control/status word.
    *
    * _MM_GET_*() returns one field: the value of _mm_getcsr() masked with the
    * field's *_MASK constant.
    *
    * _MM_SET_*(x) rewrites one field: it reads the current word, clears the
    * field's bits, ORs in (x) and stores the result with _mm_setcsr(). (x) is
    * not masked, so any bits of (x) outside the field are written as well. */

   /* Exception mask bits. */
   #define _MM_GET_EXCEPTION_MASK() \
     (_mm_getcsr() & _MM_MASK_MASK)
   #define _MM_SET_EXCEPTION_MASK(x) \
     (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))

   /* Exception state flags. */
   #define _MM_GET_EXCEPTION_STATE() \
     (_mm_getcsr() & _MM_EXCEPT_MASK)
   #define _MM_SET_EXCEPTION_STATE(x) \
     (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))

   /* Flush-to-zero mode. */
   #define _MM_GET_FLUSH_ZERO_MODE() \
     (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
   #define _MM_SET_FLUSH_ZERO_MODE(x) \
     (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))

   /* Rounding mode. */
   #define _MM_GET_ROUNDING_MODE() \
     (_mm_getcsr() & _MM_ROUND_MASK)
   #define _MM_SET_ROUNDING_MODE(x) \
     (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
   2931 
   /* Transposes, in place, the 4x4 matrix of floats whose rows are the four
    * __m128 lvalues row0..row3: after the macro, row<i> holds what was
    * column <i>.
    *
    * Each argument is both read and assigned, and is expanded more than once,
    * so it must be a modifiable lvalue without side effects.
    *
    * The temporaries use reserved (double-underscore) names. With ordinary
    * names such as tmp0..tmp3, a caller passing variables of the same name
    * would have them shadowed inside the do-block and the macro would read
    * uninitialised locals instead of the caller's rows. */
   #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
   do { \
     __m128 __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
     __m128 __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
     __m128 __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
     __m128 __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
     (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
     (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
     (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
     (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
   } while (0)
   2944 
   /* Aliases for compatibility: each _m_* name expands to the _mm_* name on its
    * right. */
   #define _m_pextrw _mm_extract_pi16
   #define _m_pinsrw _mm_insert_pi16
   #define _m_pmaxsw _mm_max_pi16
   #define _m_pmaxub _mm_max_pu8
   #define _m_pminsw _mm_min_pi16
   #define _m_pminub _mm_min_pu8
   #define _m_pmovmskb _mm_movemask_pi8
   #define _m_pmulhuw _mm_mulhi_pu16
   #define _m_pshufw _mm_shuffle_pi16
   #define _m_maskmovq _mm_maskmove_si64
   #define _m_pavgb _mm_avg_pu8
   #define _m_pavgw _mm_avg_pu16
   #define _m_psadbw _mm_sad_pu8
   /* Defined once; the second, identical definition that followed was
    * redundant. */
   #define _m_ _mm_
   2961 
   2962 #undef __DEFAULT_FN_ATTRS
   2963 
   2964 /* Ugly hack for backwards-compatibility (compatible with gcc) */
   2965 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
   2966 #include <emmintrin.h>
   2967 #endif
   2968 
   2969 #endif /* __XMMINTRIN_H */
   2970