Home | History | Annotate | Download | only in Headers
      1 /*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
      2  *
      3  * Permission is hereby granted, free of charge, to any person obtaining a copy
      4  * of this software and associated documentation files (the "Software"), to deal
      5  * in the Software without restriction, including without limitation the rights
      6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      7  * copies of the Software, and to permit persons to whom the Software is
      8  * furnished to do so, subject to the following conditions:
      9  *
     10  * The above copyright notice and this permission notice shall be included in
     11  * all copies or substantial portions of the Software.
     12  *
     13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     19  * THE SOFTWARE.
     20  *
     21  *===-----------------------------------------------------------------------===
     22  */
     23 
     24 #ifndef __TMMINTRIN_H
     25 #define __TMMINTRIN_H
     26 
     27 #include <pmmintrin.h>
     28 
     29 /* Default attributes for the functions in this file: always inlined, no debug info, SSSE3 target feature. */
     30 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
     31 
     32 /// \brief Computes the absolute value of each of the packed 8-bit signed
     33 ///    integers in the source operand and stores the 8-bit unsigned integer
     34 ///    results in the destination.
     35 ///
     36 /// \headerfile <x86intrin.h>
     37 ///
     38 /// This intrinsic corresponds to the \c PABSB instruction.
     39 ///
     40 /// \param __a
     41 ///    A 64-bit vector of [8 x i8] holding the signed source integers.
     42 /// \returns A 64-bit integer vector containing the unsigned 8-bit absolute
     43 ///    value of each element of \a __a.
     44 static __inline__ __m64 __DEFAULT_FN_ATTRS
     45 _mm_abs_pi8(__m64 __a)
     46 {
     47     return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
     48 }
     49 
     50 /// \brief Computes the absolute value of each of the packed 8-bit signed
     51 ///    integers in the source operand and stores the 8-bit unsigned integer
     52 ///    results in the destination.
     53 ///
     54 /// \headerfile <x86intrin.h>
     55 ///
     56 /// This intrinsic corresponds to the \c VPABSB instruction.
     57 ///
     58 /// \param __a
     59 ///    A 128-bit vector of [16 x i8] holding the signed source integers.
     60 /// \returns A 128-bit integer vector containing the unsigned 8-bit absolute
     61 ///    value of each element of \a __a.
     62 static __inline__ __m128i __DEFAULT_FN_ATTRS
     63 _mm_abs_epi8(__m128i __a)
     64 {
     65     return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
     66 }
     67 
     68 /// \brief Computes the absolute value of each of the packed 16-bit signed
     69 ///    integers in the source operand and stores the 16-bit unsigned integer
     70 ///    results in the destination.
     71 ///
     72 /// \headerfile <x86intrin.h>
     73 ///
     74 /// This intrinsic corresponds to the \c PABSW instruction.
     75 ///
     76 /// \param __a
     77 ///    A 64-bit vector of [4 x i16] holding the signed source integers.
     78 /// \returns A 64-bit integer vector containing the unsigned 16-bit absolute
     79 ///    value of each element of \a __a.
     80 static __inline__ __m64 __DEFAULT_FN_ATTRS
     81 _mm_abs_pi16(__m64 __a)
     82 {
     83     return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
     84 }
     85 
     86 /// \brief Computes the absolute value of each of the packed 16-bit signed
     87 ///    integers in the source operand and stores the 16-bit unsigned integer
     88 ///    results in the destination.
     89 ///
     90 /// \headerfile <x86intrin.h>
     91 ///
     92 /// This intrinsic corresponds to the \c VPABSW instruction.
     93 ///
     94 /// \param __a
     95 ///    A 128-bit vector of [8 x i16] holding the signed source integers.
     96 /// \returns A 128-bit integer vector containing the unsigned 16-bit absolute
     97 ///    value of each element of \a __a.
     98 static __inline__ __m128i __DEFAULT_FN_ATTRS
     99 _mm_abs_epi16(__m128i __a)
    100 {
    101     return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
    102 }
    103 
    104 /// \brief Computes the absolute value of each of the packed 32-bit signed
    105 ///    integers in the source operand and stores the 32-bit unsigned integer
    106 ///    results in the destination.
    107 ///
    108 /// \headerfile <x86intrin.h>
    109 ///
    110 /// This intrinsic corresponds to the \c PABSD instruction.
    111 ///
    112 /// \param __a
    113 ///    A 64-bit vector of [2 x i32] holding the signed source integers.
    114 /// \returns A 64-bit integer vector containing the unsigned 32-bit absolute
    115 ///    value of each element of \a __a.
    116 static __inline__ __m64 __DEFAULT_FN_ATTRS
    117 _mm_abs_pi32(__m64 __a)
    118 {
    119     return (__m64)__builtin_ia32_pabsd((__v2si)__a);
    120 }
    121 
    122 /// \brief Computes the absolute value of each of the packed 32-bit signed
    123 ///    integers in the source operand and stores the 32-bit unsigned integer
    124 ///    results in the destination.
    125 ///
    126 /// \headerfile <x86intrin.h>
    127 ///
    128 /// This intrinsic corresponds to the \c VPABSD instruction.
    129 ///
    130 /// \param __a
    131 ///    A 128-bit vector of [4 x i32] holding the signed source integers.
    132 /// \returns A 128-bit integer vector containing the unsigned 32-bit absolute
    133 ///    value of each element of \a __a.
    134 static __inline__ __m128i __DEFAULT_FN_ATTRS
    135 _mm_abs_epi32(__m128i __a)
    136 {
    137     return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
    138 }
    139 
    140 /// \brief Concatenates the two 128-bit integer vector operands, and
    141 ///    right-shifts the result by the number of bytes specified in the immediate
    142 ///    operand.
    143 ///
    144 /// \headerfile <x86intrin.h>
    145 ///
    146 /// \code
    147 /// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
    148 /// \endcode
    149 ///
    150 /// This intrinsic corresponds to the \c PALIGNR instruction.
    151 ///
    152 /// \param a
    153 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
    154 /// \param b
    155 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
    156 /// \param n
    157 ///    An immediate byte count by which the concatenated value is right-shifted.
    158 /// \returns A 128-bit integer vector containing the concatenated right-shifted
    159 ///    value.
    160 #define _mm_alignr_epi8(a, b, n) __extension__ ({ \
    161   (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
    162                                      (__v16qi)(__m128i)(b), (n)); })
    163 
    164 /// \brief Concatenates the two 64-bit integer vector operands, and right-shifts
    165 ///    the result by the number of bytes specified in the immediate operand.
    166 ///
    167 /// \headerfile <x86intrin.h>
    168 ///
    169 /// \code
    170 /// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
    171 /// \endcode
    172 ///
    173 /// This intrinsic corresponds to the \c PALIGNR instruction.
    174 ///
    175 /// \param a
    176 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
    177 /// \param b
    178 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
    179 /// \param n
    180 ///    An immediate byte count by which the concatenated value is right-shifted.
    181 /// \returns A 64-bit integer vector containing the concatenated right-shifted
    182 ///    value.
    183 #define _mm_alignr_pi8(a, b, n) __extension__ ({ \
    184   (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); })
    185 
    186 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
    187 ///    128-bit vectors of [8 x i16].
    188 ///
    189 /// \headerfile <x86intrin.h>
    190 ///
    191 /// This intrinsic corresponds to the \c VPHADDW instruction.
    192 ///
    193 /// \param __a
    194 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    195 ///    horizontal sums of its adjacent pairs are stored in the lower bits of
    196 ///    the destination.
    197 /// \param __b
    198 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    199 ///    horizontal sums of its adjacent pairs are stored in the upper bits of
    200 ///    the destination.
    201 /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
    202 ///    both operands.
    203 static __inline__ __m128i __DEFAULT_FN_ATTRS
    204 _mm_hadd_epi16(__m128i __a, __m128i __b)
    205 {
    206     return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
    207 }
    208 
    209 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
    210 ///    128-bit vectors of [4 x i32].
    211 ///
    212 /// \headerfile <x86intrin.h>
    213 ///
    214 /// This intrinsic corresponds to the \c VPHADDD instruction.
    215 ///
    216 /// \param __a
    217 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
    218 ///    horizontal sums of its adjacent pairs are stored in the lower bits of
    219 ///    the destination.
    220 /// \param __b
    221 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
    222 ///    horizontal sums of its adjacent pairs are stored in the upper bits of
    223 ///    the destination.
    224 /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
    225 ///    both operands.
    226 static __inline__ __m128i __DEFAULT_FN_ATTRS
    227 _mm_hadd_epi32(__m128i __a, __m128i __b)
    228 {
    229     return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
    230 }
    231 
    232 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
    233 ///    64-bit vectors of [4 x i16].
    234 ///
    235 /// \headerfile <x86intrin.h>
    236 ///
    237 /// This intrinsic corresponds to the \c PHADDW instruction.
    238 ///
    239 /// \param __a
    240 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    241 ///    horizontal sums of its adjacent pairs are stored in the lower bits of
    242 ///    the destination.
    243 /// \param __b
    244 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    245 ///    horizontal sums of its adjacent pairs are stored in the upper bits of
    246 ///    the destination.
    247 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
    248 ///    operands.
    249 static __inline__ __m64 __DEFAULT_FN_ATTRS
    250 _mm_hadd_pi16(__m64 __a, __m64 __b)
    251 {
    252     return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
    253 }
    254 
    255 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
    256 ///    64-bit vectors of [2 x i32].
    257 ///
    258 /// \headerfile <x86intrin.h>
    259 ///
    260 /// This intrinsic corresponds to the \c PHADDD instruction.
    261 ///
    262 /// \param __a
    263 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
    264 ///    horizontal sums of its adjacent pairs are stored in the lower bits of
    265 ///    the destination.
    266 /// \param __b
    267 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
    268 ///    horizontal sums of its adjacent pairs are stored in the upper bits of
    269 ///    the destination.
    270 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
    271 ///    operands.
    272 static __inline__ __m64 __DEFAULT_FN_ATTRS
    273 _mm_hadd_pi32(__m64 __a, __m64 __b)
    274 {
    275     return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
    276 }
    277 
    278 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
    279 ///    128-bit vectors of [8 x i16], with signed saturation: sums above 7FFFh
    280 ///    (32767) become 7FFFh and sums below 8000h (-32768) become 8000h.
    281 ///
    282 /// \headerfile <x86intrin.h>
    283 ///
    284 /// This intrinsic corresponds to the \c VPHADDSW instruction.
    285 ///
    286 /// \param __a
    287 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    288 ///    horizontal sums of its adjacent pairs are stored in the lower bits of
    289 ///    the destination.
    290 /// \param __b
    291 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    292 ///    horizontal sums of its adjacent pairs are stored in the upper bits of
    293 ///    the destination.
    294 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
    295 ///    sums of both operands.
    296 static __inline__ __m128i __DEFAULT_FN_ATTRS
    297 _mm_hadds_epi16(__m128i __a, __m128i __b)
    298 {
    299     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
    300 }
    301 
    302 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
    303 ///    64-bit vectors of [4 x i16], with signed saturation: sums above 7FFFh
    304 ///    (32767) become 7FFFh and sums below 8000h (-32768) become 8000h.
    305 ///
    306 /// \headerfile <x86intrin.h>
    307 ///
    308 /// This intrinsic corresponds to the \c PHADDSW instruction.
    309 ///
    310 /// \param __a
    311 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    312 ///    horizontal sums of its adjacent pairs are stored in the lower bits of
    313 ///    the destination.
    314 /// \param __b
    315 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    316 ///    horizontal sums of its adjacent pairs are stored in the upper bits of
    317 ///    the destination.
    318 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
    319 ///    sums of both operands.
    320 static __inline__ __m64 __DEFAULT_FN_ATTRS
    321 _mm_hadds_pi16(__m64 __a, __m64 __b)
    322 {
    323     return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
    324 }
    325 
    326 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
    327 ///    packed 128-bit vectors of [8 x i16].
    328 ///
    329 /// \headerfile <x86intrin.h>
    330 ///
    331 /// This intrinsic corresponds to the \c VPHSUBW instruction.
    332 ///
    333 /// \param __a
    334 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    335 ///    horizontal differences of its adjacent pairs are stored in the lower bits
    336 ///    of the destination.
    337 /// \param __b
    338 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    339 ///    horizontal differences of its adjacent pairs are stored in the upper bits
    340 ///    of the destination.
    341 /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
    342 ///    of both operands.
    343 static __inline__ __m128i __DEFAULT_FN_ATTRS
    344 _mm_hsub_epi16(__m128i __a, __m128i __b)
    345 {
    346     return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
    347 }
    348 
    349 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
    350 ///    packed 128-bit vectors of [4 x i32].
    351 ///
    352 /// \headerfile <x86intrin.h>
    353 ///
    354 /// This intrinsic corresponds to the \c VPHSUBD instruction.
    355 ///
    356 /// \param __a
    357 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
    358 ///    horizontal differences of its adjacent pairs are stored in the lower bits
    359 ///    of the destination.
    360 /// \param __b
    361 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
    362 ///    horizontal differences of its adjacent pairs are stored in the upper bits
    363 ///    of the destination.
    364 /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
    365 ///    of both operands.
    366 static __inline__ __m128i __DEFAULT_FN_ATTRS
    367 _mm_hsub_epi32(__m128i __a, __m128i __b)
    368 {
    369     return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
    370 }
    371 
    372 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
    373 ///    packed 64-bit vectors of [4 x i16].
    374 ///
    375 /// \headerfile <x86intrin.h>
    376 ///
    377 /// This intrinsic corresponds to the \c PHSUBW instruction.
    378 ///
    379 /// \param __a
    380 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    381 ///    horizontal differences of its adjacent pairs are stored in the lower bits
    382 ///    of the destination.
    383 /// \param __b
    384 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    385 ///    horizontal differences of its adjacent pairs are stored in the upper bits
    386 ///    of the destination.
    387 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
    388 ///    of both operands.
    389 static __inline__ __m64 __DEFAULT_FN_ATTRS
    390 _mm_hsub_pi16(__m64 __a, __m64 __b)
    391 {
    392     return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
    393 }
    394 
    395 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
    396 ///    packed 64-bit vectors of [2 x i32].
    397 ///
    398 /// \headerfile <x86intrin.h>
    399 ///
    400 /// This intrinsic corresponds to the \c PHSUBD instruction.
    401 ///
    402 /// \param __a
    403 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
    404 ///    horizontal differences of its adjacent pairs are stored in the lower bits
    405 ///    of the destination.
    406 /// \param __b
    407 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
    408 ///    horizontal differences of its adjacent pairs are stored in the upper bits
    409 ///    of the destination.
    410 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
    411 ///    of both operands.
    412 static __inline__ __m64 __DEFAULT_FN_ATTRS
    413 _mm_hsub_pi32(__m64 __a, __m64 __b)
    414 {
    415     return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
    416 }
    417 
    418 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
    419 ///    packed 128-bit vectors of [8 x i16], with signed saturation: differences
    420 ///    above 7FFFh (32767) become 7FFFh and differences below 8000h (-32768)
    421 ///    become 8000h.
    422 ///
    423 /// \headerfile <x86intrin.h>
    424 ///
    425 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
    426 ///
    427 /// \param __a
    428 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    429 ///    horizontal differences of its adjacent pairs are stored in the lower bits
    430 ///    of the destination.
    431 /// \param __b
    432 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    433 ///    horizontal differences of its adjacent pairs are stored in the upper bits
    434 ///    of the destination.
    435 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
    436 ///    differences of both operands.
    437 static __inline__ __m128i __DEFAULT_FN_ATTRS
    438 _mm_hsubs_epi16(__m128i __a, __m128i __b)
    439 {
    440     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
    441 }
    442 
    443 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
    444 ///    packed 64-bit vectors of [4 x i16], with signed saturation: differences
    445 ///    above 7FFFh (32767) become 7FFFh and differences below 8000h (-32768)
    446 ///    become 8000h.
    447 ///
    448 /// \headerfile <x86intrin.h>
    449 ///
    450 /// This intrinsic corresponds to the \c PHSUBSW instruction.
    451 ///
    452 /// \param __a
    453 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    454 ///    horizontal differences of its adjacent pairs are stored in the lower bits
    455 ///    of the destination.
    456 /// \param __b
    457 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    458 ///    horizontal differences of its adjacent pairs are stored in the upper bits
    459 ///    of the destination.
    460 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
    461 ///    differences of both operands.
    462 static __inline__ __m64 __DEFAULT_FN_ATTRS
    463 _mm_hsubs_pi16(__m64 __a, __m64 __b)
    464 {
    465     return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
    466 }
    467 
    468 /// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
    469 ///    values contained in the first source operand and packed 8-bit signed
    470 ///    integer values contained in the second source operand, adds pairs of
    471 ///    contiguous products with signed saturation, and writes the 16-bit sums to
    472 ///    the corresponding bits in the destination. For example, bits [7:0] of
    473 ///    both operands are multiplied, bits [15:8] of both operands are
    474 ///    multiplied, and the sum of both results is written to bits [15:0] of the
    475 ///    destination.
    476 ///
    477 /// \headerfile <x86intrin.h>
    478 ///
    479 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
    480 ///
    481 /// \param __a
    482 ///    A 128-bit vector of [16 x i8] holding the unsigned 8-bit source values.
    483 /// \param __b
    484 ///    A 128-bit vector of [16 x i8] holding the signed 8-bit source values.
    485 /// \returns A 128-bit vector of [8 x i16] containing the saturated sums of
    486 ///    products of both operands: \n
    487 ///    R0 := (__a0 * __b0) + (__a1 * __b1) \n
    488 ///    R1 := (__a2 * __b2) + (__a3 * __b3) \n
    489 ///    R2 := (__a4 * __b4) + (__a5 * __b5) \n
    490 ///    R3 := (__a6 * __b6) + (__a7 * __b7) \n
    491 ///    R4 := (__a8 * __b8) + (__a9 * __b9) \n
    492 ///    R5 := (__a10 * __b10) + (__a11 * __b11) \n
    493 ///    R6 := (__a12 * __b12) + (__a13 * __b13) \n
    494 ///    R7 := (__a14 * __b14) + (__a15 * __b15)
    495 static __inline__ __m128i __DEFAULT_FN_ATTRS
    496 _mm_maddubs_epi16(__m128i __a, __m128i __b)
    497 {
    498     return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
    499 }
    500 
    501 /// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
    502 ///    values contained in the first source operand and packed 8-bit signed
    503 ///    integer values contained in the second source operand, adds pairs of
    504 ///    contiguous products with signed saturation, and writes the 16-bit sums to
    505 ///    the corresponding bits in the destination. For example, bits [7:0] of
    506 ///    both operands are multiplied, bits [15:8] of both operands are
    507 ///    multiplied, and the sum of both results is written to bits [15:0] of the
    508 ///    destination.
    509 ///
    510 /// \headerfile <x86intrin.h>
    511 ///
    512 /// This intrinsic corresponds to the \c PMADDUBSW instruction.
    513 ///
    514 /// \param __a
    515 ///    A 64-bit vector of [8 x i8] holding the unsigned 8-bit source values.
    516 /// \param __b
    517 ///    A 64-bit vector of [8 x i8] holding the signed 8-bit source values.
    518 /// \returns A 64-bit vector of [4 x i16] containing the saturated sums of
    519 ///    products of both operands: \n
    520 ///    R0 := (__a0 * __b0) + (__a1 * __b1) \n
    521 ///    R1 := (__a2 * __b2) + (__a3 * __b3) \n
    522 ///    R2 := (__a4 * __b4) + (__a5 * __b5) \n
    523 ///    R3 := (__a6 * __b6) + (__a7 * __b7)
    524 static __inline__ __m64 __DEFAULT_FN_ATTRS
    525 _mm_maddubs_pi16(__m64 __a, __m64 __b)
    526 {
    527     return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
    528 }
    529 
    530 /// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
    531 ///    products to the 18 most significant bits by right-shifting, rounds the
    532 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
    533 ///
    534 /// \headerfile <x86intrin.h>
    535 ///
    536 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
    537 ///
    538 /// \param __a
    539 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
    540 /// \param __b
    541 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
    542 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
    543 ///    products of the corresponding elements of both operands.
    544 static __inline__ __m128i __DEFAULT_FN_ATTRS
    545 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
    546 {
    547     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
    548 }
    549 
    550 /// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
    551 ///    products to the 18 most significant bits by right-shifting, rounds the
    552 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
    553 ///
    554 /// \headerfile <x86intrin.h>
    555 ///
    556 /// This intrinsic corresponds to the \c PMULHRSW instruction.
    557 ///
    558 /// \param __a
    559 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
    560 /// \param __b
    561 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
    562 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
    563 ///    products of the corresponding elements of both operands.
    564 static __inline__ __m64 __DEFAULT_FN_ATTRS
    565 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
    566 {
    567     return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
    568 }
    569 
    570 /// \brief Copies the 8-bit integers from a 128-bit integer vector to the
    571 ///    destination or clears 8-bit values in the destination, as specified by
    572 ///    the second source operand.
    573 ///
    574 /// \headerfile <x86intrin.h>
    575 ///
    576 /// This intrinsic corresponds to the \c VPSHUFB instruction.
    577 ///
    578 /// \param __a
    579 ///    A 128-bit vector of [16 x i8] containing the values to be copied.
    580 /// \param __b
    581 ///    A 128-bit integer vector containing control bytes corresponding to
    582 ///    positions in the destination: \n
    583 ///    Bit 7: \n
    584 ///    1: Clear the corresponding byte in the destination. \n
    585 ///    0: Copy the selected source byte to the corresponding byte in the
    586 ///    destination. \n
    587 ///    Bits [6:4] Reserved. \n
    588 ///    Bits [3:0] select the source byte to be copied.
    589 /// \returns A 128-bit integer vector containing the copied or cleared values.
    590 static __inline__ __m128i __DEFAULT_FN_ATTRS
    591 _mm_shuffle_epi8(__m128i __a, __m128i __b)
    592 {
    593     return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
    594 }
    595 
    596 /// \brief Copies the 8-bit integers from a 64-bit integer vector to the
    597 ///    destination or clears 8-bit values in the destination, as specified by
    598 ///    the second source operand.
    599 ///
    600 /// \headerfile <x86intrin.h>
    601 ///
    602 /// This intrinsic corresponds to the \c PSHUFB instruction.
    603 ///
    604 /// \param __a
    605 ///    A 64-bit vector of [8 x i8] containing the values to be copied.
    606 /// \param __b
    607 ///    A 64-bit integer vector containing control bytes corresponding to
    608 ///    positions in the destination: \n
    609 ///    Bit 7: \n
    610 ///    1: Clear the corresponding byte in the destination. \n
    611 ///    0: Copy the selected source byte to the corresponding byte in the
    612 ///    destination. \n
    613 ///    Bits [2:0] select the source byte to be copied.
    614 /// \returns A 64-bit integer vector containing the copied or cleared values.
    615 static __inline__ __m64 __DEFAULT_FN_ATTRS
    616 _mm_shuffle_pi8(__m64 __a, __m64 __b)
    617 {
    618     return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
    619 }
    620 
    621 /// \brief For each 8-bit integer in the first source operand, perform one of
    622 ///    the following actions as specified by the second source operand: If the
    623 ///    byte in the second source is negative, calculate the two's complement of
    624 ///    the corresponding byte in the first source, and write that value to the
    625 ///    destination. If the byte in the second source is positive, copy the
    626 ///    corresponding byte from the first source to the destination. If the byte
    627 ///    in the second source is zero, clear the corresponding byte in the
    628 ///    destination.
    629 ///
    630 /// \headerfile <x86intrin.h>
    631 ///
    632 /// This intrinsic corresponds to the \c VPSIGNB instruction.
    633 ///
    634 /// \param __a
    635 ///    A 128-bit vector of [16 x i8] containing the source values.
    636 /// \param __b
    637 ///    A 128-bit vector of [16 x i8] containing control bytes corresponding to
    638 ///    positions in the destination.
    639 /// \returns A 128-bit integer vector containing the resultant values.
    640 static __inline__ __m128i __DEFAULT_FN_ATTRS
    641 _mm_sign_epi8(__m128i __a, __m128i __b)
    642 {
    643     return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
    644 }
    645 
    646 /// \brief For each 16-bit integer in the first source operand, perform one of
    647 ///    the following actions as specified by the second source operand: If the
    648 ///    word in the second source is negative, calculate the two's complement of
    649 ///    the corresponding word in the first source, and write that value to the
    650 ///    destination. If the word in the second source is positive, copy the
    651 ///    corresponding word from the first source to the destination. If the word
    652 ///    in the second source is zero, clear the corresponding word in the
    653 ///    destination.
    654 ///
    655 /// \headerfile <x86intrin.h>
    656 ///
    657 /// This intrinsic corresponds to the \c VPSIGNW instruction.
    658 ///
    659 /// \param __a
    660 ///    A 128-bit vector of [8 x i16] containing the source values.
    661 /// \param __b
    662 ///    A 128-bit vector of [8 x i16] containing control words corresponding to
    663 ///    positions in the destination.
    664 /// \returns A 128-bit integer vector containing the resultant values.
    665 static __inline__ __m128i __DEFAULT_FN_ATTRS
    666 _mm_sign_epi16(__m128i __a, __m128i __b)
    667 {
    668     return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
    669 }
    670 
    671 /// \brief For each 32-bit integer in the first source operand, perform one of
    672 ///    the following actions as specified by the second source operand: If the
    673 ///    doubleword in the second source is negative, calculate the two's
    674 ///    complement of the corresponding doubleword in the first source, and
    675 ///    write that value to the destination. If the doubleword in the second
    676 ///    source is positive, copy the corresponding doubleword from the first
    677 ///    source to the destination. If the doubleword in the second source is
    678 ///    zero, clear the corresponding doubleword in the destination.
    679 ///
    680 /// \headerfile <x86intrin.h>
    681 ///
    682 /// This intrinsic corresponds to the \c VPSIGND instruction.
    683 ///
    684 /// \param __a
    685 ///    A 128-bit vector of [4 x i32] containing the source values.
    686 /// \param __b
    687 ///    A 128-bit integer vector containing control doublewords corresponding to
    688 ///    positions in the destination.
    689 /// \returns A 128-bit integer vector containing the resultant values.
    690 static __inline__ __m128i __DEFAULT_FN_ATTRS
    691 _mm_sign_epi32(__m128i __a, __m128i __b)
    692 {
    693     return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
    694 }
    695 
    696 /// \brief For each 8-bit integer in the first source operand, perform one of
    697 ///    the following actions as specified by the second source operand: If the
    698 ///    byte in the second source is negative, calculate the two's complement of
    699 ///    the corresponding byte in the first source, and write that value to the
    700 ///    destination. If the byte in the second source is positive, copy the
    701 ///    corresponding byte from the first source to the destination. If the byte
    702 ///    in the second source is zero, clear the corresponding byte in the
    703 ///    destination.
    704 ///
    705 /// \headerfile <x86intrin.h>
    706 ///
    707 /// This intrinsic corresponds to the \c PSIGNB instruction.
    708 ///
    709 /// \param __a
    710 ///    A 64-bit integer vector of source bytes to negate, copy, or clear.
    711 /// \param __b
    712 ///    A 64-bit integer vector of control bytes; each one selects the action
    713 ///    (negate, copy, or clear) for the same destination byte.
    714 /// \returns A 64-bit integer vector containing the resultant values.
    715 static __inline__ __m64 __DEFAULT_FN_ATTRS
    716 _mm_sign_pi8(__m64 __a, __m64 __b)
    717 {
    718     return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
    719 }
    720 
    721 /// \brief For each 16-bit integer in the first source operand, perform one of
    722 ///    the following actions as specified by the second source operand: If the
    723 ///    word in the second source is negative, calculate the two's complement of
    724 ///    the corresponding word in the first source, and write that value to the
    725 ///    destination. If the word in the second source is positive, copy the
    726 ///    corresponding word from the first source to the destination. If the word
    727 ///    in the second source is zero, clear the corresponding word in the
    728 ///    destination.
    729 ///
    730 /// \headerfile <x86intrin.h>
    731 ///
    732 /// This intrinsic corresponds to the \c PSIGNW instruction.
    733 ///
    734 /// \param __a
    735 ///    A 64-bit integer vector of source words to negate, copy, or clear.
    736 /// \param __b
    737 ///    A 64-bit integer vector of control words; each one selects the action
    738 ///    (negate, copy, or clear) for the same destination word.
    739 /// \returns A 64-bit integer vector containing the resultant values.
    740 static __inline__ __m64 __DEFAULT_FN_ATTRS
    741 _mm_sign_pi16(__m64 __a, __m64 __b)
    742 {
    743     return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
    744 }
    745 
    746 /// \brief For each 32-bit integer in the first source operand, perform one of
    747 ///    the following actions as specified by the second source operand: If the
    748 ///    doubleword in the second source is negative, calculate the two's
    749 ///    complement of the corresponding doubleword in the first source, and
    750 ///    write that value to the destination. If the doubleword in the second
    751 ///    source is positive, copy the corresponding doubleword from the first
    752 ///    source to the destination. If the doubleword in the second source is
    753 ///    zero, clear the corresponding doubleword in the destination.
    754 ///
    755 /// \headerfile <x86intrin.h>
    756 ///
    757 /// This intrinsic corresponds to the \c PSIGND instruction.
    758 ///
    759 /// \param __a
    760 ///    A 64-bit integer vector of source doublewords to negate, copy, or clear.
    761 /// \param __b
    762 ///    A 64-bit integer vector of two control doublewords; each one selects the
    763 ///    action (negate, copy, or clear) for the same destination doubleword.
    764 /// \returns A 64-bit integer vector containing the resultant values.
    765 static __inline__ __m64 __DEFAULT_FN_ATTRS
    766 _mm_sign_pi32(__m64 __a, __m64 __b)
    767 {
    768     return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
    769 }
    770 
    771 #undef __DEFAULT_FN_ATTRS
    772 
    773 #endif /* __TMMINTRIN_H */
    774