Home | History | Annotate | Download | only in include
      1 /*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
      2  *
      3  * Permission is hereby granted, free of charge, to any person obtaining a copy
      4  * of this software and associated documentation files (the "Software"), to deal
      5  * in the Software without restriction, including without limitation the rights
      6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      7  * copies of the Software, and to permit persons to whom the Software is
      8  * furnished to do so, subject to the following conditions:
      9  *
     10  * The above copyright notice and this permission notice shall be included in
     11  * all copies or substantial portions of the Software.
     12  *
     13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     19  * THE SOFTWARE.
     20  *
     21  *===-----------------------------------------------------------------------===
     22  */
     23 
     24 #ifndef __TMMINTRIN_H
     25 #define __TMMINTRIN_H
     26 
     27 #include <pmmintrin.h>
     28 
     29 /* Define the default attributes for the functions in this file. */
     30 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
     31 
     32 /// \brief Computes the absolute value of each of the packed 8-bit signed
     33 ///    integers in the source operand and stores the 8-bit unsigned integer
     34 ///    results in the destination.
     35 ///
     36 /// \headerfile <x86intrin.h>
     37 ///
     38 /// This intrinsic corresponds to the \c PABSB instruction.
     39 ///
     40 /// \param __a
     41 ///    A 64-bit vector of [8 x i8].
     42 /// \returns A 64-bit integer vector containing the absolute values of the
     43 ///    elements in the operand.
     44 static __inline__ __m64 __DEFAULT_FN_ATTRS
     45 _mm_abs_pi8(__m64 __a)
     46 {
     47     return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
     48 }
     49 
     50 /// \brief Computes the absolute value of each of the packed 8-bit signed
     51 ///    integers in the source operand and stores the 8-bit unsigned integer
     52 ///    results in the destination.
     53 ///
     54 /// \headerfile <x86intrin.h>
     55 ///
     56 /// This intrinsic corresponds to the \c VPABSB instruction.
     57 ///
     58 /// \param __a
     59 ///    A 128-bit vector of [16 x i8].
     60 /// \returns A 128-bit integer vector containing the absolute values of the
     61 ///    elements in the operand.
     62 static __inline__ __m128i __DEFAULT_FN_ATTRS
     63 _mm_abs_epi8(__m128i __a)
     64 {
     65     return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
     66 }
     67 
     68 /// \brief Computes the absolute value of each of the packed 16-bit signed
     69 ///    integers in the source operand and stores the 16-bit unsigned integer
     70 ///    results in the destination.
     71 ///
     72 /// \headerfile <x86intrin.h>
     73 ///
     74 /// This intrinsic corresponds to the \c PABSW instruction.
     75 ///
     76 /// \param __a
     77 ///    A 64-bit vector of [4 x i16].
     78 /// \returns A 64-bit integer vector containing the absolute values of the
     79 ///    elements in the operand.
     80 static __inline__ __m64 __DEFAULT_FN_ATTRS
     81 _mm_abs_pi16(__m64 __a)
     82 {
     83     return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
     84 }
     85 
     86 /// \brief Computes the absolute value of each of the packed 16-bit signed
     87 ///    integers in the source operand and stores the 16-bit unsigned integer
     88 ///    results in the destination.
     89 ///
     90 /// \headerfile <x86intrin.h>
     91 ///
     92 /// This intrinsic corresponds to the \c VPABSW instruction.
     93 ///
     94 /// \param __a
     95 ///    A 128-bit vector of [8 x i16].
     96 /// \returns A 128-bit integer vector containing the absolute values of the
     97 ///    elements in the operand.
     98 static __inline__ __m128i __DEFAULT_FN_ATTRS
     99 _mm_abs_epi16(__m128i __a)
    100 {
    101     return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
    102 }
    103 
    104 /// \brief Computes the absolute value of each of the packed 32-bit signed
    105 ///    integers in the source operand and stores the 32-bit unsigned integer
    106 ///    results in the destination.
    107 ///
    108 /// \headerfile <x86intrin.h>
    109 ///
    110 /// This intrinsic corresponds to the \c PABSD instruction.
    111 ///
    112 /// \param __a
    113 ///    A 64-bit vector of [2 x i32].
    114 /// \returns A 64-bit integer vector containing the absolute values of the
    115 ///    elements in the operand.
    116 static __inline__ __m64 __DEFAULT_FN_ATTRS
    117 _mm_abs_pi32(__m64 __a)
    118 {
    119     return (__m64)__builtin_ia32_pabsd((__v2si)__a);
    120 }
    121 
    122 /// \brief Computes the absolute value of each of the packed 32-bit signed
    123 ///    integers in the source operand and stores the 32-bit unsigned integer
    124 ///    results in the destination.
    125 ///
    126 /// \headerfile <x86intrin.h>
    127 ///
    128 /// This intrinsic corresponds to the \c VPABSD instruction.
    129 ///
    130 /// \param __a
    131 ///    A 128-bit vector of [4 x i32].
    132 /// \returns A 128-bit integer vector containing the absolute values of the
    133 ///    elements in the operand.
    134 static __inline__ __m128i __DEFAULT_FN_ATTRS
    135 _mm_abs_epi32(__m128i __a)
    136 {
    137     return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
    138 }
    139 
    140 /// \brief Concatenates the two 128-bit integer vector operands, and
    141 ///    right-shifts the result by the number of bytes specified in the immediate
    142 ///    operand.
    143 ///
    144 /// \headerfile <x86intrin.h>
    145 ///
    146 /// \code
    147 /// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
    148 /// \endcode
    149 ///
    150 /// This intrinsic corresponds to the \c PALIGNR instruction.
    151 ///
    152 /// \param a
    153 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
    154 /// \param b
    155 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
    156 /// \param n
    157 ///    An immediate operand specifying how many bytes to right-shift the result.
    158 /// \returns A 128-bit integer vector containing the concatenated right-shifted
    159 ///    value.
    160 #define _mm_alignr_epi8(a, b, n) __extension__ ({ \
    161   (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
    162                                      (__v16qi)(__m128i)(b), (n)); })
    163 
    164 /// \brief Concatenates the two 64-bit integer vector operands, and right-shifts
    165 ///    the result by the number of bytes specified in the immediate operand.
    166 ///
    167 /// \headerfile <x86intrin.h>
    168 ///
    169 /// \code
    170 /// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
    171 /// \endcode
    172 ///
    173 /// This intrinsic corresponds to the \c PALIGNR instruction.
    174 ///
    175 /// \param a
    176 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
    177 /// \param b
    178 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
    179 /// \param n
    180 ///    An immediate operand specifying how many bytes to right-shift the result.
    181 /// \returns A 64-bit integer vector containing the concatenated right-shifted
    182 ///    value.
    183 #define _mm_alignr_pi8(a, b, n) __extension__ ({ \
    184   (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); })
    185 
    186 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
    187 ///    128-bit vectors of [8 x i16].
    188 ///
    189 /// \headerfile <x86intrin.h>
    190 ///
    191 /// This intrinsic corresponds to the \c VPHADDW instruction.
    192 ///
    193 /// \param __a
    194 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    195 ///    horizontal sums of the values are stored in the lower bits of the
    196 ///    destination.
    197 /// \param __b
    198 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    199 ///    horizontal sums of the values are stored in the upper bits of the
    200 ///    destination.
    201 /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
    202 ///    both operands.
    203 static __inline__ __m128i __DEFAULT_FN_ATTRS
    204 _mm_hadd_epi16(__m128i __a, __m128i __b)
    205 {
    206     return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
    207 }
    208 
    209 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
    210 ///    128-bit vectors of [4 x i32].
    211 ///
    212 /// \headerfile <x86intrin.h>
    213 ///
    214 /// This intrinsic corresponds to the \c VPHADDD instruction.
    215 ///
    216 /// \param __a
    217 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
    218 ///    horizontal sums of the values are stored in the lower bits of the
    219 ///    destination.
    220 /// \param __b
    221 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
    222 ///    horizontal sums of the values are stored in the upper bits of the
    223 ///    destination.
    224 /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
    225 ///    both operands.
    226 static __inline__ __m128i __DEFAULT_FN_ATTRS
    227 _mm_hadd_epi32(__m128i __a, __m128i __b)
    228 {
    229     return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
    230 }
    231 
    232 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
    233 ///    64-bit vectors of [4 x i16].
    234 ///
    235 /// \headerfile <x86intrin.h>
    236 ///
    237 /// This intrinsic corresponds to the \c PHADDW instruction.
    238 ///
    239 /// \param __a
    240 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    241 ///    horizontal sums of the values are stored in the lower bits of the
    242 ///    destination.
    243 /// \param __b
    244 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    245 ///    horizontal sums of the values are stored in the upper bits of the
    246 ///    destination.
    247 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
    248 ///    operands.
    249 static __inline__ __m64 __DEFAULT_FN_ATTRS
    250 _mm_hadd_pi16(__m64 __a, __m64 __b)
    251 {
    252     return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
    253 }
    254 
    255 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
    256 ///    64-bit vectors of [2 x i32].
    257 ///
    258 /// \headerfile <x86intrin.h>
    259 ///
    260 /// This intrinsic corresponds to the \c PHADDD instruction.
    261 ///
    262 /// \param __a
    263 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
    264 ///    horizontal sums of the values are stored in the lower bits of the
    265 ///    destination.
    266 /// \param __b
    267 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
    268 ///    horizontal sums of the values are stored in the upper bits of the
    269 ///    destination.
    270 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
    271 ///    operands.
    272 static __inline__ __m64 __DEFAULT_FN_ATTRS
    273 _mm_hadd_pi32(__m64 __a, __m64 __b)
    274 {
    275     return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
    276 }
    277 
    278 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
    279 ///    128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are
    280 ///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
    281 ///
    282 /// \headerfile <x86intrin.h>
    283 ///
    284 /// This intrinsic corresponds to the \c VPHADDSW instruction.
    285 ///
    286 /// \param __a
    287 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    288 ///    horizontal sums of the values are stored in the lower bits of the
    289 ///    destination.
    290 /// \param __b
    291 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    292 ///    horizontal sums of the values are stored in the upper bits of the
    293 ///    destination.
    294 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
    295 ///    sums of both operands.
    296 static __inline__ __m128i __DEFAULT_FN_ATTRS
    297 _mm_hadds_epi16(__m128i __a, __m128i __b)
    298 {
    299     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
    300 }
    301 
    302 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
    303 ///    64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are
    304 ///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
    305 ///
    306 /// \headerfile <x86intrin.h>
    307 ///
    308 /// This intrinsic corresponds to the \c PHADDSW instruction.
    309 ///
    310 /// \param __a
    311 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    312 ///    horizontal sums of the values are stored in the lower bits of the
    313 ///    destination.
    314 /// \param __b
    315 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    316 ///    horizontal sums of the values are stored in the upper bits of the
    317 ///    destination.
    318 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
    319 ///    sums of both operands.
    320 static __inline__ __m64 __DEFAULT_FN_ATTRS
    321 _mm_hadds_pi16(__m64 __a, __m64 __b)
    322 {
    323     return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
    324 }
    325 
    326 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
    327 ///    packed 128-bit vectors of [8 x i16].
    328 ///
    329 /// \headerfile <x86intrin.h>
    330 ///
    331 /// This intrinsic corresponds to the \c VPHSUBW instruction.
    332 ///
    333 /// \param __a
    334 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    335 ///    horizontal differences between the values are stored in the lower bits of
    336 ///    the destination.
    337 /// \param __b
    338 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    339 ///    horizontal differences between the values are stored in the upper bits of
    340 ///    the destination.
    341 /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
    342 ///    of both operands.
    343 static __inline__ __m128i __DEFAULT_FN_ATTRS
    344 _mm_hsub_epi16(__m128i __a, __m128i __b)
    345 {
    346     return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
    347 }
    348 
    349 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
    350 ///    packed 128-bit vectors of [4 x i32].
    351 ///
    352 /// \headerfile <x86intrin.h>
    353 ///
    354 /// This intrinsic corresponds to the \c VPHSUBD instruction.
    355 ///
    356 /// \param __a
    357 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
    358 ///    horizontal differences between the values are stored in the lower bits of
    359 ///    the destination.
    360 /// \param __b
    361 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
    362 ///    horizontal differences between the values are stored in the upper bits of
    363 ///    the destination.
    364 /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
    365 ///    of both operands.
    366 static __inline__ __m128i __DEFAULT_FN_ATTRS
    367 _mm_hsub_epi32(__m128i __a, __m128i __b)
    368 {
    369     return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
    370 }
    371 
    372 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
    373 ///    packed 64-bit vectors of [4 x i16].
    374 ///
    375 /// \headerfile <x86intrin.h>
    376 ///
    377 /// This intrinsic corresponds to the \c PHSUBW instruction.
    378 ///
    379 /// \param __a
    380 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    381 ///    horizontal differences between the values are stored in the lower bits of
    382 ///    the destination.
    383 /// \param __b
    384 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    385 ///    horizontal differences between the values are stored in the upper bits of
    386 ///    the destination.
    387 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
    388 ///    of both operands.
    389 static __inline__ __m64 __DEFAULT_FN_ATTRS
    390 _mm_hsub_pi16(__m64 __a, __m64 __b)
    391 {
    392     return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
    393 }
    394 
    395 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
    396 ///    packed 64-bit vectors of [2 x i32].
    397 ///
    398 /// \headerfile <x86intrin.h>
    399 ///
    400 /// This intrinsic corresponds to the \c PHSUBD instruction.
    401 ///
    402 /// \param __a
    403 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
    404 ///    horizontal differences between the values are stored in the lower bits of
    405 ///    the destination.
    406 /// \param __b
    407 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
    408 ///    horizontal differences between the values are stored in the upper bits of
    409 ///    the destination.
    410 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
    411 ///    of both operands.
    412 static __inline__ __m64 __DEFAULT_FN_ATTRS
    413 _mm_hsub_pi32(__m64 __a, __m64 __b)
    414 {
    415     return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
    416 }
    417 
    418 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
    419 ///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
    420 ///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
    421 ///    saturated to 8000h.
    422 ///
    423 /// \headerfile <x86intrin.h>
    424 ///
    425 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
    426 ///
    427 /// \param __a
    428 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    429 ///    horizontal differences between the values are stored in the lower bits of
    430 ///    the destination.
    431 /// \param __b
    432 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    433 ///    horizontal differences between the values are stored in the upper bits of
    434 ///    the destination.
    435 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
    436 ///    differences of both operands.
    437 static __inline__ __m128i __DEFAULT_FN_ATTRS
    438 _mm_hsubs_epi16(__m128i __a, __m128i __b)
    439 {
    440     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
    441 }
    442 
    443 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
    444 ///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
    445 ///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
    446 ///    saturated to 8000h.
    447 ///
    448 /// \headerfile <x86intrin.h>
    449 ///
    450 /// This intrinsic corresponds to the \c PHSUBSW instruction.
    451 ///
    452 /// \param __a
    453 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    454 ///    horizontal differences between the values are stored in the lower bits of
    455 ///    the destination.
    456 /// \param __b
    457 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    458 ///    horizontal differences between the values are stored in the upper bits of
    459 ///    the destination.
    460 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
    461 ///    differences of both operands.
    462 static __inline__ __m64 __DEFAULT_FN_ATTRS
    463 _mm_hsubs_pi16(__m64 __a, __m64 __b)
    464 {
    465     return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
    466 }
    467 
    468 /// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
    469 ///    values contained in the first source operand and packed 8-bit signed
    470 ///    integer values contained in the second source operand, adds pairs of
    471 ///    contiguous products with signed saturation, and writes the 16-bit sums to
    472 ///    the corresponding bits in the destination.
    473 ///
    474 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
    475 ///    both operands are multiplied, and the sum of both results is written to
    476 ///    bits [15:0] of the destination.
    477 ///
    478 /// \headerfile <x86intrin.h>
    479 ///
    480 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
    481 ///
    482 /// \param __a
    483 ///    A 128-bit integer vector containing the first source operand.
    484 /// \param __b
    485 ///    A 128-bit integer vector containing the second source operand.
    486 /// \returns A 128-bit integer vector containing the sums of products of both
    487 ///    operands: \n
    488 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
    489 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
    490 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
    491 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
    492 ///    \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
    493 ///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
    494 ///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
    495 ///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
    496 static __inline__ __m128i __DEFAULT_FN_ATTRS
    497 _mm_maddubs_epi16(__m128i __a, __m128i __b)
    498 {
    499     return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
    500 }
    501 
    502 /// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
    503 ///    values contained in the first source operand and packed 8-bit signed
    504 ///    integer values contained in the second source operand, adds pairs of
    505 ///    contiguous products with signed saturation, and writes the 16-bit sums to
    506 ///    the corresponding bits in the destination.
    507 ///
    508 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
    509 ///    both operands are multiplied, and the sum of both results is written to
    510 ///    bits [15:0] of the destination.
    511 ///
    512 /// \headerfile <x86intrin.h>
    513 ///
    514 /// This intrinsic corresponds to the \c PMADDUBSW instruction.
    515 ///
    516 /// \param __a
    517 ///    A 64-bit integer vector containing the first source operand.
    518 /// \param __b
    519 ///    A 64-bit integer vector containing the second source operand.
    520 /// \returns A 64-bit integer vector containing the sums of products of both
    521 ///    operands: \n
    522 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
    523 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
    524 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
    525 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
    526 static __inline__ __m64 __DEFAULT_FN_ATTRS
    527 _mm_maddubs_pi16(__m64 __a, __m64 __b)
    528 {
    529     return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
    530 }
    531 
    532 /// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
    533 ///    products to the 18 most significant bits by right-shifting, rounds the
    534 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
    535 ///
    536 /// \headerfile <x86intrin.h>
    537 ///
    538 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
    539 ///
    540 /// \param __a
    541 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
    542 /// \param __b
    543 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
    544 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
    545 ///    products of both operands.
    546 static __inline__ __m128i __DEFAULT_FN_ATTRS
    547 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
    548 {
    549     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
    550 }
    551 
    552 /// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
    553 ///    products to the 18 most significant bits by right-shifting, rounds the
    554 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
    555 ///
    556 /// \headerfile <x86intrin.h>
    557 ///
    558 /// This intrinsic corresponds to the \c PMULHRSW instruction.
    559 ///
    560 /// \param __a
    561 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
    562 /// \param __b
    563 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
    564 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
    565 ///    products of both operands.
    566 static __inline__ __m64 __DEFAULT_FN_ATTRS
    567 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
    568 {
    569     return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
    570 }
    571 
    572 /// \brief Copies the 8-bit integers from a 128-bit integer vector to the
    573 ///    destination or clears 8-bit values in the destination, as specified by
    574 ///    the second source operand.
    575 ///
    576 /// \headerfile <x86intrin.h>
    577 ///
    578 /// This intrinsic corresponds to the \c VPSHUFB instruction.
    579 ///
    580 /// \param __a
    581 ///    A 128-bit integer vector containing the values to be copied.
    582 /// \param __b
    583 ///    A 128-bit integer vector containing control bytes corresponding to
    584 ///    positions in the destination:
    585 ///    Bit 7: \n
    586 ///    1: Clear the corresponding byte in the destination. \n
    587 ///    0: Copy the selected source byte to the corresponding byte in the
    588 ///    destination. \n
    589 ///    Bits [6:4] Reserved.  \n
    590 ///    Bits [3:0] select the source byte to be copied.
    591 /// \returns A 128-bit integer vector containing the copied or cleared values.
    592 static __inline__ __m128i __DEFAULT_FN_ATTRS
    593 _mm_shuffle_epi8(__m128i __a, __m128i __b)
    594 {
    595     return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
    596 }
    597 
    598 /// \brief Copies the 8-bit integers from a 64-bit integer vector to the
    599 ///    destination or clears 8-bit values in the destination, as specified by
    600 ///    the second source operand.
    601 ///
    602 /// \headerfile <x86intrin.h>
    603 ///
    604 /// This intrinsic corresponds to the \c PSHUFB instruction.
    605 ///
    606 /// \param __a
    607 ///    A 64-bit integer vector containing the values to be copied.
    608 /// \param __b
    609 ///    A 64-bit integer vector containing control bytes corresponding to
    610 ///    positions in the destination:
    611 ///    Bit 7: \n
    612 ///    1: Clear the corresponding byte in the destination. \n
    613 ///    0: Copy the selected source byte to the corresponding byte in the
    614 ///    destination. \n
    615 ///    Bits [3:0] select the source byte to be copied.
    616 /// \returns A 64-bit integer vector containing the copied or cleared values.
    617 static __inline__ __m64 __DEFAULT_FN_ATTRS
    618 _mm_shuffle_pi8(__m64 __a, __m64 __b)
    619 {
    620     return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
    621 }
    622 
    623 /// \brief For each 8-bit integer in the first source operand, perform one of
    624 ///    the following actions as specified by the second source operand.
    625 ///
    626 ///    If the byte in the second source is negative, calculate the two's
    627 ///    complement of the corresponding byte in the first source, and write that
    628 ///    value to the destination. If the byte in the second source is positive,
    629 ///    copy the corresponding byte from the first source to the destination. If
    630 ///    the byte in the second source is zero, clear the corresponding byte in
    631 ///    the destination.
    632 ///
    633 /// \headerfile <x86intrin.h>
    634 ///
    635 /// This intrinsic corresponds to the \c VPSIGNB instruction.
    636 ///
    637 /// \param __a
    638 ///    A 128-bit integer vector containing the values to be copied.
    639 /// \param __b
    640 ///    A 128-bit integer vector containing control bytes corresponding to
    641 ///    positions in the destination.
    642 /// \returns A 128-bit integer vector containing the resultant values.
    643 static __inline__ __m128i __DEFAULT_FN_ATTRS
    644 _mm_sign_epi8(__m128i __a, __m128i __b)
    645 {
    646     return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
    647 }
    648 
    649 /// \brief For each 16-bit integer in the first source operand, perform one of
    650 ///    the following actions as specified by the second source operand.
    651 ///
    652 ///    If the word in the second source is negative, calculate the two's
    653 ///    complement of the corresponding word in the first source, and write that
    654 ///    value to the destination. If the word in the second source is positive,
    655 ///    copy the corresponding word from the first source to the destination. If
    656 ///    the word in the second source is zero, clear the corresponding word in
    657 ///    the destination.
    658 ///
    659 /// \headerfile <x86intrin.h>
    660 ///
    661 /// This intrinsic corresponds to the \c VPSIGNW instruction.
    662 ///
    663 /// \param __a
    664 ///    A 128-bit integer vector containing the values to be copied.
    665 /// \param __b
    666 ///    A 128-bit integer vector containing control words corresponding to
    667 ///    positions in the destination.
    668 /// \returns A 128-bit integer vector containing the resultant values.
    669 static __inline__ __m128i __DEFAULT_FN_ATTRS
    670 _mm_sign_epi16(__m128i __a, __m128i __b)
    671 {
    672     return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
    673 }
    674 
    675 /// \brief For each 32-bit integer in the first source operand, perform one of
    676 ///    the following actions as specified by the second source operand.
    677 ///
    678 ///    If the doubleword in the second source is negative, calculate the two's
    679 ///    complement of the corresponding word in the first source, and write that
    680 ///    value to the destination. If the doubleword in the second source is
    681 ///    positive, copy the corresponding word from the first source to the
    682 ///    destination. If the doubleword in the second source is zero, clear the
    683 ///    corresponding word in the destination.
    684 ///
    685 /// \headerfile <x86intrin.h>
    686 ///
    687 /// This intrinsic corresponds to the \c VPSIGND instruction.
    688 ///
    689 /// \param __a
    690 ///    A 128-bit integer vector containing the values to be copied.
    691 /// \param __b
    692 ///    A 128-bit integer vector containing control doublewords corresponding to
    693 ///    positions in the destination.
    694 /// \returns A 128-bit integer vector containing the resultant values.
    695 static __inline__ __m128i __DEFAULT_FN_ATTRS
    696 _mm_sign_epi32(__m128i __a, __m128i __b)
    697 {
    698     return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
    699 }
    700 
    701 /// \brief For each 8-bit integer in the first source operand, perform one of
    702 ///    the following actions as specified by the second source operand.
    703 ///
    704 ///    If the byte in the second source is negative, calculate the two's
    705 ///    complement of the corresponding byte in the first source, and write that
    706 ///    value to the destination. If the byte in the second source is positive,
    707 ///    copy the corresponding byte from the first source to the destination. If
    708 ///    the byte in the second source is zero, clear the corresponding byte in
    709 ///    the destination.
    710 ///
    711 /// \headerfile <x86intrin.h>
    712 ///
    713 /// This intrinsic corresponds to the \c PSIGNB instruction.
    714 ///
    715 /// \param __a
    716 ///    A 64-bit integer vector containing the values to be copied.
    717 /// \param __b
    718 ///    A 64-bit integer vector containing control bytes corresponding to
    719 ///    positions in the destination.
    720 /// \returns A 64-bit integer vector containing the resultant values.
    721 static __inline__ __m64 __DEFAULT_FN_ATTRS
    722 _mm_sign_pi8(__m64 __a, __m64 __b)
    723 {
    724     return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
    725 }
    726 
    727 /// \brief For each 16-bit integer in the first source operand, perform one of
    728 ///    the following actions as specified by the second source operand.
    729 ///
    730 ///    If the word in the second source is negative, calculate the two's
    731 ///    complement of the corresponding word in the first source, and write that
    732 ///    value to the destination. If the word in the second source is positive,
    733 ///    copy the corresponding word from the first source to the destination. If
    734 ///    the word in the second source is zero, clear the corresponding word in
    735 ///    the destination.
    736 ///
    737 /// \headerfile <x86intrin.h>
    738 ///
    739 /// This intrinsic corresponds to the \c PSIGNW instruction.
    740 ///
    741 /// \param __a
    742 ///    A 64-bit integer vector containing the values to be copied.
    743 /// \param __b
    744 ///    A 64-bit integer vector containing control words corresponding to
    745 ///    positions in the destination.
    746 /// \returns A 64-bit integer vector containing the resultant values.
    747 static __inline__ __m64 __DEFAULT_FN_ATTRS
    748 _mm_sign_pi16(__m64 __a, __m64 __b)
    749 {
    750     return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
    751 }
    752 
    753 /// \brief For each 32-bit integer in the first source operand, perform one of
    754 ///    the following actions as specified by the second source operand.
    755 ///
    756 ///    If the doubleword in the second source is negative, calculate the two's
    757 ///    complement of the corresponding doubleword in the first source, and
    758 ///    write that value to the destination. If the doubleword in the second
    759 ///    source is positive, copy the corresponding doubleword from the first
    760 ///    source to the destination. If the doubleword in the second source is
    761 ///    zero, clear the corresponding doubleword in the destination.
    762 ///
    763 /// \headerfile <x86intrin.h>
    764 ///
    765 /// This intrinsic corresponds to the \c PSIGND instruction.
    766 ///
    767 /// \param __a
    768 ///    A 64-bit integer vector containing the values to be copied.
    769 /// \param __b
    770 ///    A 64-bit integer vector containing two control doublewords corresponding
    771 ///    to positions in the destination.
    772 /// \returns A 64-bit integer vector containing the resultant values.
    773 static __inline__ __m64 __DEFAULT_FN_ATTRS
    774 _mm_sign_pi32(__m64 __a, __m64 __b)
    775 {
    776     return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
    777 }
    778 
    779 #undef __DEFAULT_FN_ATTRS
    780 
    781 #endif /* __TMMINTRIN_H */
    782