Home | History | Annotate | Download | only in include
      1 /*===---- mmintrin.h - MMX intrinsics --------------------------------------===
      2  *
      3  * Permission is hereby granted, free of charge, to any person obtaining a copy
      4  * of this software and associated documentation files (the "Software"), to deal
      5  * in the Software without restriction, including without limitation the rights
      6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      7  * copies of the Software, and to permit persons to whom the Software is
      8  * furnished to do so, subject to the following conditions:
      9  *
     10  * The above copyright notice and this permission notice shall be included in
     11  * all copies or substantial portions of the Software.
     12  *
     13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     19  * THE SOFTWARE.
     20  *
     21  *===-----------------------------------------------------------------------===
     22  */
     23 
     24 #ifndef __MMINTRIN_H
     25 #define __MMINTRIN_H
     26 
/* Public 64-bit MMX vector type: an 8-byte vector whose element type is
 * long long. */
typedef long long __m64 __attribute__((__vector_size__(8)));

/* Element-typed 8-byte vector views. The intrinsics in this header cast
 * __m64 operands to these types before handing them to the compiler
 * builtins; the documentation of those intrinsics describes them as
 * [2 x i32] (__v2si), [4 x i16] (__v4hi) and [8 x i8] (__v8qi).
 * __v1di is the long long element view. */
typedef long long __v1di __attribute__((__vector_size__(8)));
typedef int __v2si __attribute__((__vector_size__(8)));
typedef short __v4hi __attribute__((__vector_size__(8)));
typedef char __v8qi __attribute__((__vector_size__(8)));
     33 
     34 /* Define the default attributes for the functions in this file. */
     35 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
     36 
     37 /// \brief Clears the MMX state by setting the state of the x87 stack registers
     38 ///    to empty.
     39 ///
     40 /// \headerfile <x86intrin.h>
     41 ///
     42 /// This intrinsic corresponds to the <c> EMMS </c> instruction.
     43 ///
     44 static __inline__ void __DEFAULT_FN_ATTRS
     45 _mm_empty(void)
     46 {
     47     __builtin_ia32_emms();
     48 }
     49 
     50 /// \brief Constructs a 64-bit integer vector, setting the lower 32 bits to the
     51 ///    value of the 32-bit integer parameter and setting the upper 32 bits to 0.
     52 ///
     53 /// \headerfile <x86intrin.h>
     54 ///
     55 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
     56 ///
     57 /// \param __i
     58 ///    A 32-bit integer value.
     59 /// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
     60 ///    parameter. The upper 32 bits are set to 0.
     61 static __inline__ __m64 __DEFAULT_FN_ATTRS
     62 _mm_cvtsi32_si64(int __i)
     63 {
     64     return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
     65 }
     66 
     67 /// \brief Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
     68 ///    signed integer.
     69 ///
     70 /// \headerfile <x86intrin.h>
     71 ///
     72 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
     73 ///
     74 /// \param __m
     75 ///    A 64-bit integer vector.
     76 /// \returns A 32-bit signed integer value containing the lower 32 bits of the
     77 ///    parameter.
     78 static __inline__ int __DEFAULT_FN_ATTRS
     79 _mm_cvtsi64_si32(__m64 __m)
     80 {
     81     return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
     82 }
     83 
     84 /// \brief Casts a 64-bit signed integer value into a 64-bit integer vector.
     85 ///
     86 /// \headerfile <x86intrin.h>
     87 ///
     88 /// This intrinsic corresponds to the <c> VMOVQ / MOVD </c> instruction.
     89 ///
     90 /// \param __i
     91 ///    A 64-bit signed integer.
     92 /// \returns A 64-bit integer vector containing the same bitwise pattern as the
     93 ///    parameter.
     94 static __inline__ __m64 __DEFAULT_FN_ATTRS
     95 _mm_cvtsi64_m64(long long __i)
     96 {
     97     return (__m64)__i;
     98 }
     99 
    100 /// \brief Casts a 64-bit integer vector into a 64-bit signed integer value.
    101 ///
    102 /// \headerfile <x86intrin.h>
    103 ///
    104 /// This intrinsic corresponds to the <c> VMOVQ / MOVD </c> instruction.
    105 ///
    106 /// \param __m
    107 ///    A 64-bit integer vector.
    108 /// \returns A 64-bit signed integer containing the same bitwise pattern as the
    109 ///    parameter.
    110 static __inline__ long long __DEFAULT_FN_ATTRS
    111 _mm_cvtm64_si64(__m64 __m)
    112 {
    113     return (long long)__m;
    114 }
    115 
    116 /// \brief Converts 16-bit signed integers from both 64-bit integer vector
    117 ///    parameters of [4 x i16] into 8-bit signed integer values, and constructs
    118 ///    a 64-bit integer vector of [8 x i8] as the result. Positive values
    119 ///    greater than 0x7F are saturated to 0x7F. Negative values less than 0x80
    120 ///    are saturated to 0x80.
    121 ///
    122 /// \headerfile <x86intrin.h>
    123 ///
    124 /// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
    125 ///
    126 /// \param __m1
    127 ///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
    128 ///    16-bit signed integer and is converted to an 8-bit signed integer with
    129 ///    saturation. Positive values greater than 0x7F are saturated to 0x7F.
    130 ///    Negative values less than 0x80 are saturated to 0x80. The converted
    131 ///    [4 x i8] values are written to the lower 32 bits of the result.
    132 /// \param __m2
    133 ///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
    134 ///    16-bit signed integer and is converted to an 8-bit signed integer with
    135 ///    saturation. Positive values greater than 0x7F are saturated to 0x7F.
    136 ///    Negative values less than 0x80 are saturated to 0x80. The converted
    137 ///    [4 x i8] values are written to the upper 32 bits of the result.
    138 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
    139 ///    values.
    140 static __inline__ __m64 __DEFAULT_FN_ATTRS
    141 _mm_packs_pi16(__m64 __m1, __m64 __m2)
    142 {
    143     return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
    144 }
    145 
    146 /// \brief Converts 32-bit signed integers from both 64-bit integer vector
    147 ///    parameters of [2 x i32] into 16-bit signed integer values, and constructs
    148 ///    a 64-bit integer vector of [4 x i16] as the result. Positive values
    149 ///    greater than 0x7FFF are saturated to 0x7FFF. Negative values less than
    150 ///    0x8000 are saturated to 0x8000.
    151 ///
    152 /// \headerfile <x86intrin.h>
    153 ///
    154 /// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
    155 ///
    156 /// \param __m1
    157 ///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
    158 ///    32-bit signed integer and is converted to a 16-bit signed integer with
    159 ///    saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
    160 ///    Negative values less than 0x8000 are saturated to 0x8000. The converted
    161 ///    [2 x i16] values are written to the lower 32 bits of the result.
    162 /// \param __m2
    163 ///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
    164 ///    32-bit signed integer and is converted to a 16-bit signed integer with
    165 ///    saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
    166 ///    Negative values less than 0x8000 are saturated to 0x8000. The converted
    167 ///    [2 x i16] values are written to the upper 32 bits of the result.
    168 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
    169 ///    values.
    170 static __inline__ __m64 __DEFAULT_FN_ATTRS
    171 _mm_packs_pi32(__m64 __m1, __m64 __m2)
    172 {
    173     return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
    174 }
    175 
    176 /// \brief Converts 16-bit signed integers from both 64-bit integer vector
    177 ///    parameters of [4 x i16] into 8-bit unsigned integer values, and
    178 ///    constructs a 64-bit integer vector of [8 x i8] as the result. Values
    179 ///    greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated
    180 ///    to 0.
    181 ///
    182 /// \headerfile <x86intrin.h>
    183 ///
    184 /// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
    185 ///
    186 /// \param __m1
    187 ///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
    188 ///    16-bit signed integer and is converted to an 8-bit unsigned integer with
    189 ///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
    190 ///    than 0 are saturated to 0. The converted [4 x i8] values are written to
    191 ///    the lower 32 bits of the result.
    192 /// \param __m2
    193 ///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
    194 ///    16-bit signed integer and is converted to an 8-bit unsigned integer with
    195 ///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
    196 ///    than 0 are saturated to 0. The converted [4 x i8] values are written to
    197 ///    the upper 32 bits of the result.
    198 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
    199 ///    values.
    200 static __inline__ __m64 __DEFAULT_FN_ATTRS
    201 _mm_packs_pu16(__m64 __m1, __m64 __m2)
    202 {
    203     return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
    204 }
    205 
    206 /// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
    207 ///    and interleaves them into a 64-bit integer vector of [8 x i8].
    208 ///
    209 /// \headerfile <x86intrin.h>
    210 ///
    211 /// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
    212 ///
    213 /// \param __m1
    214 ///    A 64-bit integer vector of [8 x i8]. \n
    215 ///    Bits [39:32] are written to bits [7:0] of the result. \n
    216 ///    Bits [47:40] are written to bits [23:16] of the result. \n
    217 ///    Bits [55:48] are written to bits [39:32] of the result. \n
    218 ///    Bits [63:56] are written to bits [55:48] of the result.
    219 /// \param __m2
    220 ///    A 64-bit integer vector of [8 x i8].
    221 ///    Bits [39:32] are written to bits [15:8] of the result. \n
    222 ///    Bits [47:40] are written to bits [31:24] of the result. \n
    223 ///    Bits [55:48] are written to bits [47:40] of the result. \n
    224 ///    Bits [63:56] are written to bits [63:56] of the result.
    225 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
    226 ///    values.
    227 static __inline__ __m64 __DEFAULT_FN_ATTRS
    228 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
    229 {
    230     return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
    231 }
    232 
    233 /// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of
    234 ///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
    235 ///
    236 /// \headerfile <x86intrin.h>
    237 ///
    238 /// This intrinsic corresponds to the <c> PUNPCKHWD </c> instruction.
    239 ///
    240 /// \param __m1
    241 ///    A 64-bit integer vector of [4 x i16].
    242 ///    Bits [47:32] are written to bits [15:0] of the result. \n
    243 ///    Bits [63:48] are written to bits [47:32] of the result.
    244 /// \param __m2
    245 ///    A 64-bit integer vector of [4 x i16].
    246 ///    Bits [47:32] are written to bits [31:16] of the result. \n
    247 ///    Bits [63:48] are written to bits [63:48] of the result.
    248 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
    249 ///    values.
    250 static __inline__ __m64 __DEFAULT_FN_ATTRS
    251 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
    252 {
    253     return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
    254 }
    255 
    256 /// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of
    257 ///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
    258 ///
    259 /// \headerfile <x86intrin.h>
    260 ///
    261 /// This intrinsic corresponds to the <c> PUNPCKHDQ </c> instruction.
    262 ///
    263 /// \param __m1
    264 ///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
    265 ///    the lower 32 bits of the result.
    266 /// \param __m2
    267 ///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
    268 ///    the upper 32 bits of the result.
    269 /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
    270 ///    values.
    271 static __inline__ __m64 __DEFAULT_FN_ATTRS
    272 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
    273 {
    274     return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
    275 }
    276 
    277 /// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
    278 ///    and interleaves them into a 64-bit integer vector of [8 x i8].
    279 ///
    280 /// \headerfile <x86intrin.h>
    281 ///
    282 /// This intrinsic corresponds to the <c> PUNPCKLBW </c> instruction.
    283 ///
    284 /// \param __m1
    285 ///    A 64-bit integer vector of [8 x i8].
    286 ///    Bits [7:0] are written to bits [7:0] of the result. \n
    287 ///    Bits [15:8] are written to bits [23:16] of the result. \n
    288 ///    Bits [23:16] are written to bits [39:32] of the result. \n
    289 ///    Bits [31:24] are written to bits [55:48] of the result.
    290 /// \param __m2
    291 ///    A 64-bit integer vector of [8 x i8].
    292 ///    Bits [7:0] are written to bits [15:8] of the result. \n
    293 ///    Bits [15:8] are written to bits [31:24] of the result. \n
    294 ///    Bits [23:16] are written to bits [47:40] of the result. \n
    295 ///    Bits [31:24] are written to bits [63:56] of the result.
    296 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
    297 ///    values.
    298 static __inline__ __m64 __DEFAULT_FN_ATTRS
    299 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
    300 {
    301     return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
    302 }
    303 
    304 /// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of
    305 ///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
    306 ///
    307 /// \headerfile <x86intrin.h>
    308 ///
    309 /// This intrinsic corresponds to the <c> PUNPCKLWD </c> instruction.
    310 ///
    311 /// \param __m1
    312 ///    A 64-bit integer vector of [4 x i16].
    313 ///    Bits [15:0] are written to bits [15:0] of the result. \n
    314 ///    Bits [31:16] are written to bits [47:32] of the result.
    315 /// \param __m2
    316 ///    A 64-bit integer vector of [4 x i16].
    317 ///    Bits [15:0] are written to bits [31:16] of the result. \n
    318 ///    Bits [31:16] are written to bits [63:48] of the result.
    319 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
    320 ///    values.
    321 static __inline__ __m64 __DEFAULT_FN_ATTRS
    322 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
    323 {
    324     return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
    325 }
    326 
    327 /// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of
    328 ///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
    329 ///
    330 /// \headerfile <x86intrin.h>
    331 ///
    332 /// This intrinsic corresponds to the <c> PUNPCKLDQ </c> instruction.
    333 ///
    334 /// \param __m1
    335 ///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
    336 ///    the lower 32 bits of the result.
    337 /// \param __m2
    338 ///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
    339 ///    the upper 32 bits of the result.
    340 /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
    341 ///    values.
    342 static __inline__ __m64 __DEFAULT_FN_ATTRS
    343 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
    344 {
    345     return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
    346 }
    347 
    348 /// \brief Adds each 8-bit integer element of the first 64-bit integer vector
    349 ///    of [8 x i8] to the corresponding 8-bit integer element of the second
    350 ///    64-bit integer vector of [8 x i8]. The lower 8 bits of the results are
    351 ///    packed into a 64-bit integer vector of [8 x i8].
    352 ///
    353 /// \headerfile <x86intrin.h>
    354 ///
    355 /// This intrinsic corresponds to the <c> PADDB </c> instruction.
    356 ///
    357 /// \param __m1
    358 ///    A 64-bit integer vector of [8 x i8].
    359 /// \param __m2
    360 ///    A 64-bit integer vector of [8 x i8].
    361 /// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
    362 ///    parameters.
    363 static __inline__ __m64 __DEFAULT_FN_ATTRS
    364 _mm_add_pi8(__m64 __m1, __m64 __m2)
    365 {
    366     return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
    367 }
    368 
    369 /// \brief Adds each 16-bit integer element of the first 64-bit integer vector
    370 ///    of [4 x i16] to the corresponding 16-bit integer element of the second
    371 ///    64-bit integer vector of [4 x i16]. The lower 16 bits of the results are
    372 ///    packed into a 64-bit integer vector of [4 x i16].
    373 ///
    374 /// \headerfile <x86intrin.h>
    375 ///
    376 /// This intrinsic corresponds to the <c> PADDW </c> instruction.
    377 ///
    378 /// \param __m1
    379 ///    A 64-bit integer vector of [4 x i16].
    380 /// \param __m2
    381 ///    A 64-bit integer vector of [4 x i16].
    382 /// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
    383 ///    parameters.
    384 static __inline__ __m64 __DEFAULT_FN_ATTRS
    385 _mm_add_pi16(__m64 __m1, __m64 __m2)
    386 {
    387     return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
    388 }
    389 
    390 /// \brief Adds each 32-bit integer element of the first 64-bit integer vector
    391 ///    of [2 x i32] to the corresponding 32-bit integer element of the second
    392 ///    64-bit integer vector of [2 x i32]. The lower 32 bits of the results are
    393 ///    packed into a 64-bit integer vector of [2 x i32].
    394 ///
    395 /// \headerfile <x86intrin.h>
    396 ///
    397 /// This intrinsic corresponds to the <c> PADDD </c> instruction.
    398 ///
    399 /// \param __m1
    400 ///    A 64-bit integer vector of [2 x i32].
    401 /// \param __m2
    402 ///    A 64-bit integer vector of [2 x i32].
    403 /// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
    404 ///    parameters.
    405 static __inline__ __m64 __DEFAULT_FN_ATTRS
    406 _mm_add_pi32(__m64 __m1, __m64 __m2)
    407 {
    408     return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
    409 }
    410 
    411 /// \brief Adds each 8-bit signed integer element of the first 64-bit integer
    412 ///    vector of [8 x i8] to the corresponding 8-bit signed integer element of
    413 ///    the second 64-bit integer vector of [8 x i8]. Positive sums greater than
    414 ///    0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to
    415 ///    0x80. The results are packed into a 64-bit integer vector of [8 x i8].
    416 ///
    417 /// \headerfile <x86intrin.h>
    418 ///
    419 /// This intrinsic corresponds to the <c> PADDSB </c> instruction.
    420 ///
    421 /// \param __m1
    422 ///    A 64-bit integer vector of [8 x i8].
    423 /// \param __m2
    424 ///    A 64-bit integer vector of [8 x i8].
    425 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
    426 ///    of both parameters.
    427 static __inline__ __m64 __DEFAULT_FN_ATTRS
    428 _mm_adds_pi8(__m64 __m1, __m64 __m2)
    429 {
    430     return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
    431 }
    432 
    433 /// \brief Adds each 16-bit signed integer element of the first 64-bit integer
    434 ///    vector of [4 x i16] to the corresponding 16-bit signed integer element of
    435 ///    the second 64-bit integer vector of [4 x i16]. Positive sums greater than
    436 ///    0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are
    437 ///    saturated to 0x8000. The results are packed into a 64-bit integer vector
    438 ///    of [4 x i16].
    439 ///
    440 /// \headerfile <x86intrin.h>
    441 ///
    442 /// This intrinsic corresponds to the <c> PADDSW </c> instruction.
    443 ///
    444 /// \param __m1
    445 ///    A 64-bit integer vector of [4 x i16].
    446 /// \param __m2
    447 ///    A 64-bit integer vector of [4 x i16].
    448 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
    449 ///    of both parameters.
    450 static __inline__ __m64 __DEFAULT_FN_ATTRS
    451 _mm_adds_pi16(__m64 __m1, __m64 __m2)
    452 {
    453     return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
    454 }
    455 
    456 /// \brief Adds each 8-bit unsigned integer element of the first 64-bit integer
    457 ///    vector of [8 x i8] to the corresponding 8-bit unsigned integer element of
    458 ///    the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are
    459 ///    saturated to 0xFF. The results are packed into a 64-bit integer vector of
    460 ///    [8 x i8].
    461 ///
    462 /// \headerfile <x86intrin.h>
    463 ///
    464 /// This intrinsic corresponds to the <c> PADDUSB </c> instruction.
    465 ///
    466 /// \param __m1
    467 ///    A 64-bit integer vector of [8 x i8].
    468 /// \param __m2
    469 ///    A 64-bit integer vector of [8 x i8].
    470 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
    471 ///    unsigned sums of both parameters.
    472 static __inline__ __m64 __DEFAULT_FN_ATTRS
    473 _mm_adds_pu8(__m64 __m1, __m64 __m2)
    474 {
    475     return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
    476 }
    477 
    478 /// \brief Adds each 16-bit unsigned integer element of the first 64-bit integer
    479 ///    vector of [4 x i16] to the corresponding 16-bit unsigned integer element
    480 ///    of the second 64-bit integer vector of [4 x i16]. Sums greater than
    481 ///    0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit
    482 ///    integer vector of [4 x i16].
    483 ///
    484 /// \headerfile <x86intrin.h>
    485 ///
    486 /// This intrinsic corresponds to the <c> PADDUSW </c> instruction.
    487 ///
    488 /// \param __m1
    489 ///    A 64-bit integer vector of [4 x i16].
    490 /// \param __m2
    491 ///    A 64-bit integer vector of [4 x i16].
    492 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
    493 ///    unsigned sums of both parameters.
    494 static __inline__ __m64 __DEFAULT_FN_ATTRS
    495 _mm_adds_pu16(__m64 __m1, __m64 __m2)
    496 {
    497     return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
    498 }
    499 
    500 /// \brief Subtracts each 8-bit integer element of the second 64-bit integer
    501 ///    vector of [8 x i8] from the corresponding 8-bit integer element of the
    502 ///    first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results
    503 ///    are packed into a 64-bit integer vector of [8 x i8].
    504 ///
    505 /// \headerfile <x86intrin.h>
    506 ///
    507 /// This intrinsic corresponds to the <c> PSUBB </c> instruction.
    508 ///
    509 /// \param __m1
    510 ///    A 64-bit integer vector of [8 x i8] containing the minuends.
    511 /// \param __m2
    512 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
    513 /// \returns A 64-bit integer vector of [8 x i8] containing the differences of
    514 ///    both parameters.
    515 static __inline__ __m64 __DEFAULT_FN_ATTRS
    516 _mm_sub_pi8(__m64 __m1, __m64 __m2)
    517 {
    518     return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
    519 }
    520 
    521 /// \brief Subtracts each 16-bit integer element of the second 64-bit integer
    522 ///    vector of [4 x i16] from the corresponding 16-bit integer element of the
    523 ///    first 64-bit integer vector of [4 x i16]. The lower 16 bits of the
    524 ///    results are packed into a 64-bit integer vector of [4 x i16].
    525 ///
    526 /// \headerfile <x86intrin.h>
    527 ///
    528 /// This intrinsic corresponds to the <c> PSUBW </c> instruction.
    529 ///
    530 /// \param __m1
    531 ///    A 64-bit integer vector of [4 x i16] containing the minuends.
    532 /// \param __m2
    533 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
    534 /// \returns A 64-bit integer vector of [4 x i16] containing the differences of
    535 ///    both parameters.
    536 static __inline__ __m64 __DEFAULT_FN_ATTRS
    537 _mm_sub_pi16(__m64 __m1, __m64 __m2)
    538 {
    539     return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
    540 }
    541 
    542 /// \brief Subtracts each 32-bit integer element of the second 64-bit integer
    543 ///    vector of [2 x i32] from the corresponding 32-bit integer element of the
    544 ///    first 64-bit integer vector of [2 x i32]. The lower 32 bits of the
    545 ///    results are packed into a 64-bit integer vector of [2 x i32].
    546 ///
    547 /// \headerfile <x86intrin.h>
    548 ///
    549 /// This intrinsic corresponds to the <c> PSUBD </c> instruction.
    550 ///
    551 /// \param __m1
    552 ///    A 64-bit integer vector of [2 x i32] containing the minuends.
    553 /// \param __m2
    554 ///    A 64-bit integer vector of [2 x i32] containing the subtrahends.
    555 /// \returns A 64-bit integer vector of [2 x i32] containing the differences of
    556 ///    both parameters.
    557 static __inline__ __m64 __DEFAULT_FN_ATTRS
    558 _mm_sub_pi32(__m64 __m1, __m64 __m2)
    559 {
    560     return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
    561 }
    562 
    563 /// \brief Subtracts each 8-bit signed integer element of the second 64-bit
    564 ///    integer vector of [8 x i8] from the corresponding 8-bit signed integer
    565 ///    element of the first 64-bit integer vector of [8 x i8]. Positive results
    566 ///    greater than 0x7F are saturated to 0x7F. Negative results less than 0x80
    567 ///    are saturated to 0x80. The results are packed into a 64-bit integer
    568 ///    vector of [8 x i8].
    569 ///
    570 /// \headerfile <x86intrin.h>
    571 ///
    572 /// This intrinsic corresponds to the <c> PSUBSB </c> instruction.
    573 ///
    574 /// \param __m1
    575 ///    A 64-bit integer vector of [8 x i8] containing the minuends.
    576 /// \param __m2
    577 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
    578 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
    579 ///    differences of both parameters.
    580 static __inline__ __m64 __DEFAULT_FN_ATTRS
    581 _mm_subs_pi8(__m64 __m1, __m64 __m2)
    582 {
    583     return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
    584 }
    585 
    586 /// \brief Subtracts each 16-bit signed integer element of the second 64-bit
    587 ///    integer vector of [4 x i16] from the corresponding 16-bit signed integer
    588 ///    element of the first 64-bit integer vector of [4 x i16]. Positive results
    589 ///    greater than 0x7FFF are saturated to 0x7FFF. Negative results less than
    590 ///    0x8000 are saturated to 0x8000. The results are packed into a 64-bit
    591 ///    integer vector of [4 x i16].
    592 ///
    593 /// \headerfile <x86intrin.h>
    594 ///
    595 /// This intrinsic corresponds to the <c> PSUBSW </c> instruction.
    596 ///
    597 /// \param __m1
    598 ///    A 64-bit integer vector of [4 x i16] containing the minuends.
    599 /// \param __m2
    600 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
    601 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
    602 ///    differences of both parameters.
    603 static __inline__ __m64 __DEFAULT_FN_ATTRS
    604 _mm_subs_pi16(__m64 __m1, __m64 __m2)
    605 {
    606     return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
    607 }
    608 
    609 /// \brief Subtracts each 8-bit unsigned integer element of the second 64-bit
    610 ///    integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
    611 ///    element of the first 64-bit integer vector of [8 x i8].
    612 ///
    613 ///    If an element of the first vector is less than the corresponding element
    614 ///    of the second vector, the result is saturated to 0. The results are
    615 ///    packed into a 64-bit integer vector of [8 x i8].
    616 ///
    617 /// \headerfile <x86intrin.h>
    618 ///
    619 /// This intrinsic corresponds to the <c> PSUBUSB </c> instruction.
    620 ///
    621 /// \param __m1
    622 ///    A 64-bit integer vector of [8 x i8] containing the minuends.
    623 /// \param __m2
    624 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
    625 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
    626 ///    differences of both parameters.
    627 static __inline__ __m64 __DEFAULT_FN_ATTRS
    628 _mm_subs_pu8(__m64 __m1, __m64 __m2)
    629 {
    630     return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
    631 }
    632 
    633 /// \brief Subtracts each 16-bit unsigned integer element of the second 64-bit
    634 ///    integer vector of [4 x i16] from the corresponding 16-bit unsigned
    635 ///    integer element of the first 64-bit integer vector of [4 x i16].
    636 ///
    637 ///    If an element of the first vector is less than the corresponding element
    638 ///    of the second vector, the result is saturated to 0. The results are
    639 ///    packed into a 64-bit integer vector of [4 x i16].
    640 ///
    641 /// \headerfile <x86intrin.h>
    642 ///
    643 /// This intrinsic corresponds to the <c> PSUBUSW </c> instruction.
    644 ///
    645 /// \param __m1
    646 ///    A 64-bit integer vector of [4 x i16] containing the minuends.
    647 /// \param __m2
    648 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
    649 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
    650 ///    differences of both parameters.
    651 static __inline__ __m64 __DEFAULT_FN_ATTRS
    652 _mm_subs_pu16(__m64 __m1, __m64 __m2)
    653 {
    654     return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
    655 }
    656 
    657 /// \brief Multiplies each 16-bit signed integer element of the first 64-bit
    658 ///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
    659 ///    element of the second 64-bit integer vector of [4 x i16] and get four
    660 ///    32-bit products. Adds adjacent pairs of products to get two 32-bit sums.
    661 ///    The lower 32 bits of these two sums are packed into a 64-bit integer
    662 ///    vector of [2 x i32].
    663 ///
    664 ///    For example, bits [15:0] of both parameters are multiplied, bits [31:16]
    665 ///    of both parameters are multiplied, and the sum of both results is written
    666 ///    to bits [31:0] of the result.
    667 ///
    668 /// \headerfile <x86intrin.h>
    669 ///
    670 /// This intrinsic corresponds to the <c> PMADDWD </c> instruction.
    671 ///
    672 /// \param __m1
    673 ///    A 64-bit integer vector of [4 x i16].
    674 /// \param __m2
    675 ///    A 64-bit integer vector of [4 x i16].
    676 /// \returns A 64-bit integer vector of [2 x i32] containing the sums of
    677 ///    products of both parameters.
    678 static __inline__ __m64 __DEFAULT_FN_ATTRS
    679 _mm_madd_pi16(__m64 __m1, __m64 __m2)
    680 {
    681     return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
    682 }
    683 
    684 /// \brief Multiplies each 16-bit signed integer element of the first 64-bit
    685 ///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
    686 ///    element of the second 64-bit integer vector of [4 x i16]. Packs the upper
    687 ///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
    688 ///
    689 /// \headerfile <x86intrin.h>
    690 ///
    691 /// This intrinsic corresponds to the <c> PMULHW </c> instruction.
    692 ///
    693 /// \param __m1
    694 ///    A 64-bit integer vector of [4 x i16].
    695 /// \param __m2
    696 ///    A 64-bit integer vector of [4 x i16].
    697 /// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
    698 ///    of the products of both parameters.
    699 static __inline__ __m64 __DEFAULT_FN_ATTRS
    700 _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
    701 {
    702     return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
    703 }
    704 
    705 /// \brief Multiplies each 16-bit signed integer element of the first 64-bit
    706 ///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
    707 ///    element of the second 64-bit integer vector of [4 x i16]. Packs the lower
    708 ///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
    709 ///
    710 /// \headerfile <x86intrin.h>
    711 ///
    712 /// This intrinsic corresponds to the <c> PMULLW </c> instruction.
    713 ///
    714 /// \param __m1
    715 ///    A 64-bit integer vector of [4 x i16].
    716 /// \param __m2
    717 ///    A 64-bit integer vector of [4 x i16].
    718 /// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
    719 ///    of the products of both parameters.
    720 static __inline__ __m64 __DEFAULT_FN_ATTRS
    721 _mm_mullo_pi16(__m64 __m1, __m64 __m2)
    722 {
    723     return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
    724 }
    725 
    726 /// \brief Left-shifts each 16-bit signed integer element of the first
    727 ///    parameter, which is a 64-bit integer vector of [4 x i16], by the number
    728 ///    of bits specified by the second parameter, which is a 64-bit integer. The
    729 ///    lower 16 bits of the results are packed into a 64-bit integer vector of
    730 ///    [4 x i16].
    731 ///
    732 /// \headerfile <x86intrin.h>
    733 ///
    734 /// This intrinsic corresponds to the <c> PSLLW </c> instruction.
    735 ///
    736 /// \param __m
    737 ///    A 64-bit integer vector of [4 x i16].
    738 /// \param __count
    739 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
    740 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
    741 ///    values. If \a __count is greater or equal to 16, the result is set to all
    742 ///    0.
    743 static __inline__ __m64 __DEFAULT_FN_ATTRS
    744 _mm_sll_pi16(__m64 __m, __m64 __count)
    745 {
    746     return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
    747 }
    748 
    749 /// \brief Left-shifts each 16-bit signed integer element of a 64-bit integer
    750 ///    vector of [4 x i16] by the number of bits specified by a 32-bit integer.
    751 ///    The lower 16 bits of the results are packed into a 64-bit integer vector
    752 ///    of [4 x i16].
    753 ///
    754 /// \headerfile <x86intrin.h>
    755 ///
    756 /// This intrinsic corresponds to the <c> PSLLW </c> instruction.
    757 ///
    758 /// \param __m
    759 ///    A 64-bit integer vector of [4 x i16].
    760 /// \param __count
    761 ///    A 32-bit integer value.
    762 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
    763 ///    values. If \a __count is greater or equal to 16, the result is set to all
    764 ///    0.
    765 static __inline__ __m64 __DEFAULT_FN_ATTRS
    766 _mm_slli_pi16(__m64 __m, int __count)
    767 {
    768     return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
    769 }
    770 
    771 /// \brief Left-shifts each 32-bit signed integer element of the first
    772 ///    parameter, which is a 64-bit integer vector of [2 x i32], by the number
    773 ///    of bits specified by the second parameter, which is a 64-bit integer. The
    774 ///    lower 32 bits of the results are packed into a 64-bit integer vector of
    775 ///    [2 x i32].
    776 ///
    777 /// \headerfile <x86intrin.h>
    778 ///
    779 /// This intrinsic corresponds to the <c> PSLLD </c> instruction.
    780 ///
    781 /// \param __m
    782 ///    A 64-bit integer vector of [2 x i32].
    783 /// \param __count
    784 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
    785 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
    786 ///    values. If \a __count is greater or equal to 32, the result is set to all
    787 ///    0.
    788 static __inline__ __m64 __DEFAULT_FN_ATTRS
    789 _mm_sll_pi32(__m64 __m, __m64 __count)
    790 {
    791     return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
    792 }
    793 
    794 /// \brief Left-shifts each 32-bit signed integer element of a 64-bit integer
    795 ///    vector of [2 x i32] by the number of bits specified by a 32-bit integer.
    796 ///    The lower 32 bits of the results are packed into a 64-bit integer vector
    797 ///    of [2 x i32].
    798 ///
    799 /// \headerfile <x86intrin.h>
    800 ///
    801 /// This intrinsic corresponds to the <c> PSLLD </c> instruction.
    802 ///
    803 /// \param __m
    804 ///    A 64-bit integer vector of [2 x i32].
    805 /// \param __count
    806 ///    A 32-bit integer value.
    807 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
    808 ///    values. If \a __count is greater or equal to 32, the result is set to all
    809 ///    0.
    810 static __inline__ __m64 __DEFAULT_FN_ATTRS
    811 _mm_slli_pi32(__m64 __m, int __count)
    812 {
    813     return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
    814 }
    815 
    816 /// \brief Left-shifts the first 64-bit integer parameter by the number of bits
    817 ///    specified by the second 64-bit integer parameter. The lower 64 bits of
    818 ///    result are returned.
    819 ///
    820 /// \headerfile <x86intrin.h>
    821 ///
    822 /// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
    823 ///
    824 /// \param __m
    825 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
    826 /// \param __count
    827 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
    828 /// \returns A 64-bit integer vector containing the left-shifted value. If
    829 ///     \a __count is greater or equal to 64, the result is set to 0.
    830 static __inline__ __m64 __DEFAULT_FN_ATTRS
    831 _mm_sll_si64(__m64 __m, __m64 __count)
    832 {
    833     return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
    834 }
    835 
    836 /// \brief Left-shifts the first parameter, which is a 64-bit integer, by the
    837 ///    number of bits specified by the second parameter, which is a 32-bit
    838 ///    integer. The lower 64 bits of result are returned.
    839 ///
    840 /// \headerfile <x86intrin.h>
    841 ///
    842 /// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
    843 ///
    844 /// \param __m
    845 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
    846 /// \param __count
    847 ///    A 32-bit integer value.
    848 /// \returns A 64-bit integer vector containing the left-shifted value. If
    849 ///     \a __count is greater or equal to 64, the result is set to 0.
    850 static __inline__ __m64 __DEFAULT_FN_ATTRS
    851 _mm_slli_si64(__m64 __m, int __count)
    852 {
    853     return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
    854 }
    855 
    856 /// \brief Right-shifts each 16-bit integer element of the first parameter,
    857 ///    which is a 64-bit integer vector of [4 x i16], by the number of bits
    858 ///    specified by the second parameter, which is a 64-bit integer.
    859 ///
    860 ///    High-order bits are filled with the sign bit of the initial value of each
    861 ///    16-bit element. The 16-bit results are packed into a 64-bit integer
    862 ///    vector of [4 x i16].
    863 ///
    864 /// \headerfile <x86intrin.h>
    865 ///
    866 /// This intrinsic corresponds to the <c> PSRAW </c> instruction.
    867 ///
    868 /// \param __m
    869 ///    A 64-bit integer vector of [4 x i16].
    870 /// \param __count
    871 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
    872 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
    873 ///    values.
    874 static __inline__ __m64 __DEFAULT_FN_ATTRS
    875 _mm_sra_pi16(__m64 __m, __m64 __count)
    876 {
    877     return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
    878 }
    879 
    880 /// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
    881 ///    of [4 x i16] by the number of bits specified by a 32-bit integer.
    882 ///
    883 ///    High-order bits are filled with the sign bit of the initial value of each
    884 ///    16-bit element. The 16-bit results are packed into a 64-bit integer
    885 ///    vector of [4 x i16].
    886 ///
    887 /// \headerfile <x86intrin.h>
    888 ///
    889 /// This intrinsic corresponds to the <c> PSRAW </c> instruction.
    890 ///
    891 /// \param __m
    892 ///    A 64-bit integer vector of [4 x i16].
    893 /// \param __count
    894 ///    A 32-bit integer value.
    895 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
    896 ///    values.
    897 static __inline__ __m64 __DEFAULT_FN_ATTRS
    898 _mm_srai_pi16(__m64 __m, int __count)
    899 {
    900     return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
    901 }
    902 
    903 /// \brief Right-shifts each 32-bit integer element of the first parameter,
    904 ///    which is a 64-bit integer vector of [2 x i32], by the number of bits
    905 ///    specified by the second parameter, which is a 64-bit integer.
    906 ///
    907 ///    High-order bits are filled with the sign bit of the initial value of each
    908 ///    32-bit element. The 32-bit results are packed into a 64-bit integer
    909 ///    vector of [2 x i32].
    910 ///
    911 /// \headerfile <x86intrin.h>
    912 ///
    913 /// This intrinsic corresponds to the <c> PSRAD </c> instruction.
    914 ///
    915 /// \param __m
    916 ///    A 64-bit integer vector of [2 x i32].
    917 /// \param __count
    918 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
    919 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
    920 ///    values.
    921 static __inline__ __m64 __DEFAULT_FN_ATTRS
    922 _mm_sra_pi32(__m64 __m, __m64 __count)
    923 {
    924     return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
    925 }
    926 
    927 /// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
    928 ///    of [2 x i32] by the number of bits specified by a 32-bit integer.
    929 ///
    930 ///    High-order bits are filled with the sign bit of the initial value of each
    931 ///    32-bit element. The 32-bit results are packed into a 64-bit integer
    932 ///    vector of [2 x i32].
    933 ///
    934 /// \headerfile <x86intrin.h>
    935 ///
    936 /// This intrinsic corresponds to the <c> PSRAD </c> instruction.
    937 ///
    938 /// \param __m
    939 ///    A 64-bit integer vector of [2 x i32].
    940 /// \param __count
    941 ///    A 32-bit integer value.
    942 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
    943 ///    values.
    944 static __inline__ __m64 __DEFAULT_FN_ATTRS
    945 _mm_srai_pi32(__m64 __m, int __count)
    946 {
    947     return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
    948 }
    949 
    950 /// \brief Right-shifts each 16-bit integer element of the first parameter,
    951 ///    which is a 64-bit integer vector of [4 x i16], by the number of bits
    952 ///    specified by the second parameter, which is a 64-bit integer.
    953 ///
    954 ///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
    955 ///    integer vector of [4 x i16].
    956 ///
    957 /// \headerfile <x86intrin.h>
    958 ///
    959 /// This intrinsic corresponds to the <c> PSRLW </c> instruction.
    960 ///
    961 /// \param __m
    962 ///    A 64-bit integer vector of [4 x i16].
    963 /// \param __count
    964 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
    965 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
    966 ///    values.
    967 static __inline__ __m64 __DEFAULT_FN_ATTRS
    968 _mm_srl_pi16(__m64 __m, __m64 __count)
    969 {
    970     return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
    971 }
    972 
    973 /// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
    974 ///    of [4 x i16] by the number of bits specified by a 32-bit integer.
    975 ///
    976 ///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
    977 ///    integer vector of [4 x i16].
    978 ///
    979 /// \headerfile <x86intrin.h>
    980 ///
    981 /// This intrinsic corresponds to the <c> PSRLW </c> instruction.
    982 ///
    983 /// \param __m
    984 ///    A 64-bit integer vector of [4 x i16].
    985 /// \param __count
    986 ///    A 32-bit integer value.
    987 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
    988 ///    values.
    989 static __inline__ __m64 __DEFAULT_FN_ATTRS
    990 _mm_srli_pi16(__m64 __m, int __count)
    991 {
    992     return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
    993 }
    994 
    995 /// \brief Right-shifts each 32-bit integer element of the first parameter,
    996 ///    which is a 64-bit integer vector of [2 x i32], by the number of bits
    997 ///    specified by the second parameter, which is a 64-bit integer.
    998 ///
    999 ///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
   1000 ///    integer vector of [2 x i32].
   1001 ///
   1002 /// \headerfile <x86intrin.h>
   1003 ///
   1004 /// This intrinsic corresponds to the <c> PSRLD </c> instruction.
   1005 ///
   1006 /// \param __m
   1007 ///    A 64-bit integer vector of [2 x i32].
   1008 /// \param __count
   1009 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
   1010 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
   1011 ///    values.
   1012 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1013 _mm_srl_pi32(__m64 __m, __m64 __count)
   1014 {
   1015     return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
   1016 }
   1017 
   1018 /// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
   1019 ///    of [2 x i32] by the number of bits specified by a 32-bit integer.
   1020 ///
   1021 ///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
   1022 ///    integer vector of [2 x i32].
   1023 ///
   1024 /// \headerfile <x86intrin.h>
   1025 ///
   1026 /// This intrinsic corresponds to the <c> PSRLD </c> instruction.
   1027 ///
   1028 /// \param __m
   1029 ///    A 64-bit integer vector of [2 x i32].
   1030 /// \param __count
   1031 ///    A 32-bit integer value.
   1032 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
   1033 ///    values.
   1034 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1035 _mm_srli_pi32(__m64 __m, int __count)
   1036 {
   1037     return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
   1038 }
   1039 
   1040 /// \brief Right-shifts the first 64-bit integer parameter by the number of bits
   1041 ///    specified by the second 64-bit integer parameter.
   1042 ///
   1043 ///    High-order bits are cleared.
   1044 ///
   1045 /// \headerfile <x86intrin.h>
   1046 ///
   1047 /// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
   1048 ///
   1049 /// \param __m
   1050 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
   1051 /// \param __count
   1052 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
   1053 /// \returns A 64-bit integer vector containing the right-shifted value.
   1054 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1055 _mm_srl_si64(__m64 __m, __m64 __count)
   1056 {
   1057     return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
   1058 }
   1059 
   1060 /// \brief Right-shifts the first parameter, which is a 64-bit integer, by the
   1061 ///    number of bits specified by the second parameter, which is a 32-bit
   1062 ///    integer.
   1063 ///
   1064 ///    High-order bits are cleared.
   1065 ///
   1066 /// \headerfile <x86intrin.h>
   1067 ///
   1068 /// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
   1069 ///
   1070 /// \param __m
   1071 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
   1072 /// \param __count
   1073 ///    A 32-bit integer value.
   1074 /// \returns A 64-bit integer vector containing the right-shifted value.
   1075 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1076 _mm_srli_si64(__m64 __m, int __count)
   1077 {
   1078     return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
   1079 }
   1080 
   1081 /// \brief Performs a bitwise AND of two 64-bit integer vectors.
   1082 ///
   1083 /// \headerfile <x86intrin.h>
   1084 ///
   1085 /// This intrinsic corresponds to the <c> PAND </c> instruction.
   1086 ///
   1087 /// \param __m1
   1088 ///    A 64-bit integer vector.
   1089 /// \param __m2
   1090 ///    A 64-bit integer vector.
   1091 /// \returns A 64-bit integer vector containing the bitwise AND of both
   1092 ///    parameters.
   1093 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1094 _mm_and_si64(__m64 __m1, __m64 __m2)
   1095 {
   1096     return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
   1097 }
   1098 
   1099 /// \brief Performs a bitwise NOT of the first 64-bit integer vector, and then
   1100 ///    performs a bitwise AND of the intermediate result and the second 64-bit
   1101 ///    integer vector.
   1102 ///
   1103 /// \headerfile <x86intrin.h>
   1104 ///
   1105 /// This intrinsic corresponds to the <c> PANDN </c> instruction.
   1106 ///
   1107 /// \param __m1
   1108 ///    A 64-bit integer vector. The one's complement of this parameter is used
   1109 ///    in the bitwise AND.
   1110 /// \param __m2
   1111 ///    A 64-bit integer vector.
   1112 /// \returns A 64-bit integer vector containing the bitwise AND of the second
   1113 ///    parameter and the one's complement of the first parameter.
   1114 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1115 _mm_andnot_si64(__m64 __m1, __m64 __m2)
   1116 {
   1117     return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
   1118 }
   1119 
   1120 /// \brief Performs a bitwise OR of two 64-bit integer vectors.
   1121 ///
   1122 /// \headerfile <x86intrin.h>
   1123 ///
   1124 /// This intrinsic corresponds to the <c> POR </c> instruction.
   1125 ///
   1126 /// \param __m1
   1127 ///    A 64-bit integer vector.
   1128 /// \param __m2
   1129 ///    A 64-bit integer vector.
   1130 /// \returns A 64-bit integer vector containing the bitwise OR of both
   1131 ///    parameters.
   1132 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1133 _mm_or_si64(__m64 __m1, __m64 __m2)
   1134 {
   1135     return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
   1136 }
   1137 
   1138 /// \brief Performs a bitwise exclusive OR of two 64-bit integer vectors.
   1139 ///
   1140 /// \headerfile <x86intrin.h>
   1141 ///
   1142 /// This intrinsic corresponds to the <c> PXOR </c> instruction.
   1143 ///
   1144 /// \param __m1
   1145 ///    A 64-bit integer vector.
   1146 /// \param __m2
   1147 ///    A 64-bit integer vector.
   1148 /// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
   1149 ///    parameters.
   1150 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1151 _mm_xor_si64(__m64 __m1, __m64 __m2)
   1152 {
   1153     return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
   1154 }
   1155 
   1156 /// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
   1157 ///    [8 x i8] to determine if the element of the first vector is equal to the
   1158 ///    corresponding element of the second vector.
   1159 ///
   1160 ///    The comparison yields 0 for false, 0xFF for true.
   1161 ///
   1162 /// \headerfile <x86intrin.h>
   1163 ///
   1164 /// This intrinsic corresponds to the <c> PCMPEQB </c> instruction.
   1165 ///
   1166 /// \param __m1
   1167 ///    A 64-bit integer vector of [8 x i8].
   1168 /// \param __m2
   1169 ///    A 64-bit integer vector of [8 x i8].
   1170 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
   1171 ///    results.
   1172 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1173 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
   1174 {
   1175     return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
   1176 }
   1177 
   1178 /// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
   1179 ///    [4 x i16] to determine if the element of the first vector is equal to the
   1180 ///    corresponding element of the second vector.
   1181 ///
   1182 ///    The comparison yields 0 for false, 0xFFFF for true.
   1183 ///
   1184 /// \headerfile <x86intrin.h>
   1185 ///
   1186 /// This intrinsic corresponds to the <c> PCMPEQW </c> instruction.
   1187 ///
   1188 /// \param __m1
   1189 ///    A 64-bit integer vector of [4 x i16].
   1190 /// \param __m2
   1191 ///    A 64-bit integer vector of [4 x i16].
   1192 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
   1193 ///    results.
   1194 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1195 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
   1196 {
   1197     return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
   1198 }
   1199 
   1200 /// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
   1201 ///    [2 x i32] to determine if the element of the first vector is equal to the
   1202 ///    corresponding element of the second vector.
   1203 ///
   1204 ///    The comparison yields 0 for false, 0xFFFFFFFF for true.
   1205 ///
   1206 /// \headerfile <x86intrin.h>
   1207 ///
   1208 /// This intrinsic corresponds to the <c> PCMPEQD </c> instruction.
   1209 ///
   1210 /// \param __m1
   1211 ///    A 64-bit integer vector of [2 x i32].
   1212 /// \param __m2
   1213 ///    A 64-bit integer vector of [2 x i32].
   1214 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
   1215 ///    results.
   1216 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1217 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
   1218 {
   1219     return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
   1220 }
   1221 
   1222 /// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
   1223 ///    [8 x i8] to determine if the element of the first vector is greater than
   1224 ///    the corresponding element of the second vector.
   1225 ///
   1226 ///    The comparison yields 0 for false, 0xFF for true.
   1227 ///
   1228 /// \headerfile <x86intrin.h>
   1229 ///
   1230 /// This intrinsic corresponds to the <c> PCMPGTB </c> instruction.
   1231 ///
   1232 /// \param __m1
   1233 ///    A 64-bit integer vector of [8 x i8].
   1234 /// \param __m2
   1235 ///    A 64-bit integer vector of [8 x i8].
   1236 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
   1237 ///    results.
   1238 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1239 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
   1240 {
   1241     return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
   1242 }
   1243 
   1244 /// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
   1245 ///    [4 x i16] to determine if the element of the first vector is greater than
   1246 ///    the corresponding element of the second vector.
   1247 ///
   1248 ///    The comparison yields 0 for false, 0xFFFF for true.
   1249 ///
   1250 /// \headerfile <x86intrin.h>
   1251 ///
   1252 /// This intrinsic corresponds to the <c> PCMPGTW </c> instruction.
   1253 ///
   1254 /// \param __m1
   1255 ///    A 64-bit integer vector of [4 x i16].
   1256 /// \param __m2
   1257 ///    A 64-bit integer vector of [4 x i16].
   1258 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
   1259 ///    results.
   1260 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1261 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
   1262 {
   1263     return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
   1264 }
   1265 
   1266 /// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
   1267 ///    [2 x i32] to determine if the element of the first vector is greater than
   1268 ///    the corresponding element of the second vector.
   1269 ///
   1270 ///    The comparison yields 0 for false, 0xFFFFFFFF for true.
   1271 ///
   1272 /// \headerfile <x86intrin.h>
   1273 ///
   1274 /// This intrinsic corresponds to the <c> PCMPGTD </c> instruction.
   1275 ///
   1276 /// \param __m1
   1277 ///    A 64-bit integer vector of [2 x i32].
   1278 /// \param __m2
   1279 ///    A 64-bit integer vector of [2 x i32].
   1280 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
   1281 ///    results.
   1282 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1283 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
   1284 {
   1285     return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
   1286 }
   1287 
   1288 /// \brief Constructs a 64-bit integer vector initialized to zero.
   1289 ///
   1290 /// \headerfile <x86intrin.h>
   1291 ///
   1292 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
   1293 ///
   1294 /// \returns An initialized 64-bit integer vector with all elements set to zero.
   1295 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1296 _mm_setzero_si64(void)
   1297 {
   1298     return (__m64){ 0LL };
   1299 }
   1300 
   1301 /// \brief Constructs a 64-bit integer vector initialized with the specified
   1302 ///    32-bit integer values.
   1303 ///
   1304 /// \headerfile <x86intrin.h>
   1305 ///
   1306 /// This intrinsic is a utility function and does not correspond to a specific
   1307 ///    instruction.
   1308 ///
   1309 /// \param __i1
   1310 ///    A 32-bit integer value used to initialize the upper 32 bits of the
   1311 ///    result.
   1312 /// \param __i0
   1313 ///    A 32-bit integer value used to initialize the lower 32 bits of the
   1314 ///    result.
   1315 /// \returns An initialized 64-bit integer vector.
   1316 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1317 _mm_set_pi32(int __i1, int __i0)
   1318 {
   1319     return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
   1320 }
   1321 
   1322 /// \brief Constructs a 64-bit integer vector initialized with the specified
   1323 ///    16-bit integer values.
   1324 ///
   1325 /// \headerfile <x86intrin.h>
   1326 ///
   1327 /// This intrinsic is a utility function and does not correspond to a specific
   1328 ///    instruction.
   1329 ///
   1330 /// \param __s3
   1331 ///    A 16-bit integer value used to initialize bits [63:48] of the result.
   1332 /// \param __s2
   1333 ///    A 16-bit integer value used to initialize bits [47:32] of the result.
   1334 /// \param __s1
   1335 ///    A 16-bit integer value used to initialize bits [31:16] of the result.
   1336 /// \param __s0
   1337 ///    A 16-bit integer value used to initialize bits [15:0] of the result.
   1338 /// \returns An initialized 64-bit integer vector.
   1339 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1340 _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
   1341 {
   1342     return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
   1343 }
   1344 
   1345 /// \brief Constructs a 64-bit integer vector initialized with the specified
   1346 ///    8-bit integer values.
   1347 ///
   1348 /// \headerfile <x86intrin.h>
   1349 ///
   1350 /// This intrinsic is a utility function and does not correspond to a specific
   1351 ///    instruction.
   1352 ///
   1353 /// \param __b7
   1354 ///    An 8-bit integer value used to initialize bits [63:56] of the result.
   1355 /// \param __b6
   1356 ///    An 8-bit integer value used to initialize bits [55:48] of the result.
   1357 /// \param __b5
   1358 ///    An 8-bit integer value used to initialize bits [47:40] of the result.
   1359 /// \param __b4
   1360 ///    An 8-bit integer value used to initialize bits [39:32] of the result.
   1361 /// \param __b3
   1362 ///    An 8-bit integer value used to initialize bits [31:24] of the result.
   1363 /// \param __b2
   1364 ///    An 8-bit integer value used to initialize bits [23:16] of the result.
   1365 /// \param __b1
   1366 ///    An 8-bit integer value used to initialize bits [15:8] of the result.
   1367 /// \param __b0
   1368 ///    An 8-bit integer value used to initialize bits [7:0] of the result.
   1369 /// \returns An initialized 64-bit integer vector.
   1370 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1371 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
   1372             char __b1, char __b0)
   1373 {
   1374     return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
   1375                                                __b4, __b5, __b6, __b7);
   1376 }
   1377 
   1378 /// \brief Constructs a 64-bit integer vector of [2 x i32], with each of the
   1379 ///    32-bit integer vector elements set to the specified 32-bit integer
   1380 ///    value.
   1381 ///
   1382 /// \headerfile <x86intrin.h>
   1383 ///
   1384 /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
   1385 ///
   1386 /// \param __i
   1387 ///    A 32-bit integer value used to initialize each vector element of the
   1388 ///    result.
   1389 /// \returns An initialized 64-bit integer vector of [2 x i32].
   1390 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1391 _mm_set1_pi32(int __i)
   1392 {
   1393     return _mm_set_pi32(__i, __i);
   1394 }
   1395 
   1396 /// \brief Constructs a 64-bit integer vector of [4 x i16], with each of the
   1397 ///    16-bit integer vector elements set to the specified 16-bit integer
   1398 ///    value.
   1399 ///
   1400 /// \headerfile <x86intrin.h>
   1401 ///
   1402 /// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
   1403 ///
   1404 /// \param __w
   1405 ///    A 16-bit integer value used to initialize each vector element of the
   1406 ///    result.
   1407 /// \returns An initialized 64-bit integer vector of [4 x i16].
   1408 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1409 _mm_set1_pi16(short __w)
   1410 {
   1411     return _mm_set_pi16(__w, __w, __w, __w);
   1412 }
   1413 
   1414 /// \brief Constructs a 64-bit integer vector of [8 x i8], with each of the
   1415 ///    8-bit integer vector elements set to the specified 8-bit integer value.
   1416 ///
   1417 /// \headerfile <x86intrin.h>
   1418 ///
   1419 /// This intrinsic corresponds to the <c> VPUNPCKLBW + VPSHUFLW / PUNPCKLBW +
   1420 ///    PSHUFLW </c> instruction.
   1421 ///
   1422 /// \param __b
   1423 ///    An 8-bit integer value used to initialize each vector element of the
   1424 ///    result.
   1425 /// \returns An initialized 64-bit integer vector of [8 x i8].
   1426 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1427 _mm_set1_pi8(char __b)
   1428 {
   1429     return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
   1430 }
   1431 
   1432 /// \brief Constructs a 64-bit integer vector, initialized in reverse order with
   1433 ///    the specified 32-bit integer values.
   1434 ///
   1435 /// \headerfile <x86intrin.h>
   1436 ///
   1437 /// This intrinsic is a utility function and does not correspond to a specific
   1438 ///    instruction.
   1439 ///
   1440 /// \param __i0
   1441 ///    A 32-bit integer value used to initialize the lower 32 bits of the
   1442 ///    result.
   1443 /// \param __i1
   1444 ///    A 32-bit integer value used to initialize the upper 32 bits of the
   1445 ///    result.
   1446 /// \returns An initialized 64-bit integer vector.
   1447 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1448 _mm_setr_pi32(int __i0, int __i1)
   1449 {
   1450     return _mm_set_pi32(__i1, __i0);
   1451 }
   1452 
   1453 /// \brief Constructs a 64-bit integer vector, initialized in reverse order with
   1454 ///    the specified 16-bit integer values.
   1455 ///
   1456 /// \headerfile <x86intrin.h>
   1457 ///
   1458 /// This intrinsic is a utility function and does not correspond to a specific
   1459 ///    instruction.
   1460 ///
   1461 /// \param __w0
   1462 ///    A 16-bit integer value used to initialize bits [15:0] of the result.
   1463 /// \param __w1
   1464 ///    A 16-bit integer value used to initialize bits [31:16] of the result.
   1465 /// \param __w2
   1466 ///    A 16-bit integer value used to initialize bits [47:32] of the result.
   1467 /// \param __w3
   1468 ///    A 16-bit integer value used to initialize bits [63:48] of the result.
   1469 /// \returns An initialized 64-bit integer vector.
   1470 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1471 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
   1472 {
   1473     return _mm_set_pi16(__w3, __w2, __w1, __w0);
   1474 }
   1475 
   1476 /// \brief Constructs a 64-bit integer vector, initialized in reverse order with
   1477 ///    the specified 8-bit integer values.
   1478 ///
   1479 /// \headerfile <x86intrin.h>
   1480 ///
   1481 /// This intrinsic is a utility function and does not correspond to a specific
   1482 ///    instruction.
   1483 ///
   1484 /// \param __b0
   1485 ///    An 8-bit integer value used to initialize bits [7:0] of the result.
   1486 /// \param __b1
   1487 ///    An 8-bit integer value used to initialize bits [15:8] of the result.
   1488 /// \param __b2
   1489 ///    An 8-bit integer value used to initialize bits [23:16] of the result.
   1490 /// \param __b3
   1491 ///    An 8-bit integer value used to initialize bits [31:24] of the result.
   1492 /// \param __b4
   1493 ///    An 8-bit integer value used to initialize bits [39:32] of the result.
   1494 /// \param __b5
   1495 ///    An 8-bit integer value used to initialize bits [47:40] of the result.
   1496 /// \param __b6
   1497 ///    An 8-bit integer value used to initialize bits [55:48] of the result.
   1498 /// \param __b7
   1499 ///    An 8-bit integer value used to initialize bits [63:56] of the result.
   1500 /// \returns An initialized 64-bit integer vector.
   1501 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1502 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
   1503              char __b6, char __b7)
   1504 {
   1505     return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
   1506 }
   1507 
   1508 #undef __DEFAULT_FN_ATTRS
   1509 
   1510 /* Aliases for compatibility. */
   1511 #define _m_empty _mm_empty
   1512 #define _m_from_int _mm_cvtsi32_si64
   1513 #define _m_from_int64 _mm_cvtsi64_m64
   1514 #define _m_to_int _mm_cvtsi64_si32
   1515 #define _m_to_int64 _mm_cvtm64_si64
   1516 #define _m_packsswb _mm_packs_pi16
   1517 #define _m_packssdw _mm_packs_pi32
   1518 #define _m_packuswb _mm_packs_pu16
   1519 #define _m_punpckhbw _mm_unpackhi_pi8
   1520 #define _m_punpckhwd _mm_unpackhi_pi16
   1521 #define _m_punpckhdq _mm_unpackhi_pi32
   1522 #define _m_punpcklbw _mm_unpacklo_pi8
   1523 #define _m_punpcklwd _mm_unpacklo_pi16
   1524 #define _m_punpckldq _mm_unpacklo_pi32
   1525 #define _m_paddb _mm_add_pi8
   1526 #define _m_paddw _mm_add_pi16
   1527 #define _m_paddd _mm_add_pi32
   1528 #define _m_paddsb _mm_adds_pi8
   1529 #define _m_paddsw _mm_adds_pi16
   1530 #define _m_paddusb _mm_adds_pu8
   1531 #define _m_paddusw _mm_adds_pu16
   1532 #define _m_psubb _mm_sub_pi8
   1533 #define _m_psubw _mm_sub_pi16
   1534 #define _m_psubd _mm_sub_pi32
   1535 #define _m_psubsb _mm_subs_pi8
   1536 #define _m_psubsw _mm_subs_pi16
   1537 #define _m_psubusb _mm_subs_pu8
   1538 #define _m_psubusw _mm_subs_pu16
   1539 #define _m_pmaddwd _mm_madd_pi16
   1540 #define _m_pmulhw _mm_mulhi_pi16
   1541 #define _m_pmullw _mm_mullo_pi16
   1542 #define _m_psllw _mm_sll_pi16
   1543 #define _m_psllwi _mm_slli_pi16
   1544 #define _m_pslld _mm_sll_pi32
   1545 #define _m_pslldi _mm_slli_pi32
   1546 #define _m_psllq _mm_sll_si64
   1547 #define _m_psllqi _mm_slli_si64
   1548 #define _m_psraw _mm_sra_pi16
   1549 #define _m_psrawi _mm_srai_pi16
   1550 #define _m_psrad _mm_sra_pi32
   1551 #define _m_psradi _mm_srai_pi32
   1552 #define _m_psrlw _mm_srl_pi16
   1553 #define _m_psrlwi _mm_srli_pi16
   1554 #define _m_psrld _mm_srl_pi32
   1555 #define _m_psrldi _mm_srli_pi32
   1556 #define _m_psrlq _mm_srl_si64
   1557 #define _m_psrlqi _mm_srli_si64
   1558 #define _m_pand _mm_and_si64
   1559 #define _m_pandn _mm_andnot_si64
   1560 #define _m_por _mm_or_si64
   1561 #define _m_pxor _mm_xor_si64
   1562 #define _m_pcmpeqb _mm_cmpeq_pi8
   1563 #define _m_pcmpeqw _mm_cmpeq_pi16
   1564 #define _m_pcmpeqd _mm_cmpeq_pi32
   1565 #define _m_pcmpgtb _mm_cmpgt_pi8
   1566 #define _m_pcmpgtw _mm_cmpgt_pi16
   1567 #define _m_pcmpgtd _mm_cmpgt_pi32
   1568 
   1569 #endif /* __MMINTRIN_H */
   1570 
   1571