/* mmintrin.h -- GCC MMX intrinsics header.  */
      1 /* Copyright (C) 2002-2013 Free Software Foundation, Inc.
      2 
      3    This file is part of GCC.
      4 
      5    GCC is free software; you can redistribute it and/or modify
      6    it under the terms of the GNU General Public License as published by
      7    the Free Software Foundation; either version 3, or (at your option)
      8    any later version.
      9 
     10    GCC is distributed in the hope that it will be useful,
     11    but WITHOUT ANY WARRANTY; without even the implied warranty of
     12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13    GNU General Public License for more details.
     14 
     15    Under Section 7 of GPL version 3, you are granted additional
     16    permissions described in the GCC Runtime Library Exception, version
     17    3.1, as published by the Free Software Foundation.
     18 
     19    You should have received a copy of the GNU General Public License and
     20    a copy of the GCC Runtime Library Exception along with this program;
     21    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     22    <http://www.gnu.org/licenses/>.  */
     23 
     24 /* Implemented from the specification included in the Intel C++ Compiler
     25    User Guide and Reference, version 9.0.  */
     26 
     27 #ifndef _MMINTRIN_H_INCLUDED
     28 #define _MMINTRIN_H_INCLUDED
     29 
     30 #ifndef __MMX__
     31 # error "MMX instruction set not enabled"
     32 #else
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));

/* Internal data types for implementing the intrinsics.  */
typedef int __v2si __attribute__ ((__vector_size__ (8)));       /* 2 x 32-bit int    */
typedef short __v4hi __attribute__ ((__vector_size__ (8)));     /* 4 x 16-bit short  */
typedef char __v8qi __attribute__ ((__vector_size__ (8)));      /* 8 x 8-bit char    */
typedef long long __v1di __attribute__ ((__vector_size__ (8))); /* 1 x 64-bit int    */
typedef float __v2sf __attribute__ ((__vector_size__ (8)));     /* 2 x 32-bit float  */
     43 
/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  __builtin_ia32_emms ();
}

/* Instruction-mnemonic name for _mm_empty (EMMS).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  _mm_empty ();
}
     56 
/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  /* Place I in the low element and zero in the high element.  */
  return (__m64) __builtin_ia32_vec_init_v2si (__i, 0);
}

/* Alternate (historical Intel) name for _mm_cvtsi32_si64.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}
     69 
#ifdef __x86_64__
/* Convert I to a __m64 object.  Each of the following four functions
   simply reinterprets the 64 bits of the integer as an __m64.  */

/* Intel intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}
#endif
     99 
/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  /* Extract element 0 (the low 32 bits).  */
  return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0);
}

/* Alternate (historical Intel) name for _mm_cvtsi64_si32.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}
    112 
#ifdef __x86_64__
/* Convert the __m64 object to a 64bit integer.  Each of the following
   three functions simply reinterprets the 64 bits as a long long.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long)__i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long)__i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long)__i;
}
#endif
    136 
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
}

/* Instruction-mnemonic name for _mm_packs_pi16 (PACKSSWB).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}
    151 
/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
}

/* Instruction-mnemonic name for _mm_packs_pi32 (PACKSSDW).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}
    166 
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
}

/* Instruction-mnemonic name for _mm_packs_pu16 (PACKUSWB).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
    181 
/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
}

/* Instruction-mnemonic name for _mm_unpackhi_pi8 (PUNPCKHBW).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}
    195 
/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
}

/* Instruction-mnemonic name for _mm_unpackhi_pi16 (PUNPCKHWD).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}
    209 
/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
}

/* Instruction-mnemonic name for _mm_unpackhi_pi32 (PUNPCKHDQ).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}
    223 
/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
}

/* Instruction-mnemonic name for _mm_unpacklo_pi8 (PUNPCKLBW).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}
    237 
/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
}

/* Instruction-mnemonic name for _mm_unpacklo_pi16 (PUNPCKLWD).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}
    251 
/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
}

/* Instruction-mnemonic name for _mm_unpacklo_pi32 (PUNPCKLDQ).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}
    265 
/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Instruction-mnemonic name for _mm_add_pi8 (PADDB).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}
    278 
/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Instruction-mnemonic name for _mm_add_pi16 (PADDW).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}
    291 
/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
}

/* Instruction-mnemonic name for _mm_add_pi32 (PADDD).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}
    304 
/* Add the 64-bit value in M1 to the 64-bit value in M2.
   Requires SSE2 (the PADDQ instruction).  */
#ifdef __SSE2__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddq ((__v1di)__m1, (__v1di)__m2);
}
#endif
    313 
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Instruction-mnemonic name for _mm_adds_pi8 (PADDSB).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}
    327 
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Instruction-mnemonic name for _mm_adds_pi16 (PADDSW).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}
    341 
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Instruction-mnemonic name for _mm_adds_pu8 (PADDUSB).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}
    355 
/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Instruction-mnemonic name for _mm_adds_pu16 (PADDUSW).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}
    369 
/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Instruction-mnemonic name for _mm_sub_pi8 (PSUBB).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}
    382 
/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Instruction-mnemonic name for _mm_sub_pi16 (PSUBW).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}
    395 
/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
}

/* Instruction-mnemonic name for _mm_sub_pi32 (PSUBD).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}
    408 
/* Subtract the 64-bit value in M2 from the 64-bit value in M1.
   (The original comment said "Add" -- copy-paste error; this is PSUBQ.)
   Requires SSE2.  */
#ifdef __SSE2__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubq ((__v1di)__m1, (__v1di)__m2);
}
#endif
    417 
/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Instruction-mnemonic name for _mm_subs_pi8 (PSUBSB).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}
    431 
/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Instruction-mnemonic name for _mm_subs_pi16 (PSUBSW).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}
    445 
/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Instruction-mnemonic name for _mm_subs_pu8 (PSUBUSB).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}
    459 
/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Instruction-mnemonic name for _mm_subs_pu16 (PSUBUSW).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}
    473 
/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
}

/* Instruction-mnemonic name for _mm_madd_pi16 (PMADDWD).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}
    488 
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Instruction-mnemonic name for _mm_mulhi_pi16 (PMULHW).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}
    502 
/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Instruction-mnemonic name for _mm_mullo_pi16 (PMULLW).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}
    516 
/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (__v4hi)__count);
}

/* Instruction-mnemonic name for _mm_sll_pi16 (PSLLW).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

/* As above, but with the shift count given as an immediate int.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psllwi ((__v4hi)__m, __count);
}

/* Instruction-mnemonic name for _mm_slli_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}
    541 
/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_pslld ((__v2si)__m, (__v2si)__count);
}

/* Instruction-mnemonic name for _mm_sll_pi32 (PSLLD).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

/* As above, but with the shift count given as an immediate int.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_pslldi ((__v2si)__m, __count);
}

/* Instruction-mnemonic name for _mm_slli_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}
    566 
/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psllq ((__v1di)__m, (__v1di)__count);
}

/* Instruction-mnemonic name for _mm_sll_si64 (PSLLQ).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

/* As above, but with the shift count given as an immediate int.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psllqi ((__v1di)__m, __count);
}

/* Instruction-mnemonic name for _mm_slli_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, int __count)
{
  return _mm_slli_si64 (__m, __count);
}
    591 
/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (__v4hi)__count);
}

/* Instruction-mnemonic name for _mm_sra_pi16 (PSRAW).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

/* As above, but with the shift count given as an immediate int.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrawi ((__v4hi)__m, __count);
}

/* Instruction-mnemonic name for _mm_srai_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}
    616 
/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrad ((__v2si)__m, (__v2si)__count);
}

/* Instruction-mnemonic name for _mm_sra_pi32 (PSRAD).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

/* As above, but with the shift count given as an immediate int.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psradi ((__v2si)__m, __count);
}

/* Instruction-mnemonic name for _mm_srai_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}
    641 
/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (__v4hi)__count);
}

/* Instruction-mnemonic name for _mm_srl_pi16 (PSRLW).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

/* As above, but with the shift count given as an immediate int.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrlwi ((__v4hi)__m, __count);
}

/* Instruction-mnemonic name for _mm_srli_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}
    666 
/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrld ((__v2si)__m, (__v2si)__count);
}

/* Instruction-mnemonic name for _mm_srl_pi32 (PSRLD).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

/* As above, but with the shift count given as an immediate int.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrldi ((__v2si)__m, __count);
}

/* Instruction-mnemonic name for _mm_srli_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
    691 
/* Shift the 64-bit value in M right by COUNT; shift in zeros.
   (The original comment said "left" -- copy-paste error; this is PSRLQ,
   a logical right shift.)  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrlq ((__v1di)__m, (__v1di)__count);
}

/* Instruction-mnemonic name for _mm_srl_si64 (PSRLQ).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

/* As above, but with the shift count given as an immediate int.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrlqi ((__v1di)__m, __count);
}

/* Instruction-mnemonic name for _mm_srli_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, int __count)
{
  return _mm_srli_si64 (__m, __count);
}
    716 
/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pand (__m1, __m2);
}

/* Instruction-mnemonic name for _mm_and_si64 (PAND).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}
    729 
/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2, i.e. (~M1) & M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pandn (__m1, __m2);
}

/* Instruction-mnemonic name for _mm_andnot_si64 (PANDN).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}
    743 
/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_por (__m1, __m2);
}

/* Instruction-mnemonic name for _mm_or_si64 (POR).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}
    756 
/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pxor (__m1, __m2);
}

/* Instruction-mnemonic name for _mm_xor_si64 (PXOR).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}
    769 
/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Instruction-mnemonic name for _mm_cmpeq_pi8 (PCMPEQB).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

/* Signed greater-than comparison of the eight 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Instruction-mnemonic name for _mm_cmpgt_pi8 (PCMPGTB).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}
    795 
    796 /* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
    797    the test is true and zero if false.  */
    798 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    799 _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
    800 {
    801   return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
    802 }
    803 
    804 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    805 _m_pcmpeqw (__m64 __m1, __m64 __m2)
    806 {
    807   return _mm_cmpeq_pi16 (__m1, __m2);
    808 }
    809 
    810 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    811 _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
    812 {
    813   return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
    814 }
    815 
    816 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    817 _m_pcmpgtw (__m64 __m1, __m64 __m2)
    818 {
    819   return _mm_cmpgt_pi16 (__m1, __m2);
    820 }
    821 
    822 /* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
    823    the test is true and zero if false.  */
    824 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    825 _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
    826 {
    827   return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
    828 }
    829 
    830 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    831 _m_pcmpeqd (__m64 __m1, __m64 __m2)
    832 {
    833   return _mm_cmpeq_pi32 (__m1, __m2);
    834 }
    835 
    836 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    837 _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
    838 {
    839   return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
    840 }
    841 
    842 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    843 _m_pcmpgtd (__m64 __m1, __m64 __m2)
    844 {
    845   return _mm_cmpgt_pi32 (__m1, __m2);
    846 }
    847 
    848 /* Creates a 64-bit zero.  */
    849 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    850 _mm_setzero_si64 (void)
    851 {
    852   return (__m64)0LL;
    853 }
    854 
    855 /* Creates a vector of two 32-bit values; I0 is least significant.  */
    856 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    857 _mm_set_pi32 (int __i1, int __i0)
    858 {
    859   return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
    860 }
    861 
    862 /* Creates a vector of four 16-bit values; W0 is least significant.  */
    863 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    864 _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
    865 {
    866   return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3);
    867 }
    868 
    869 /* Creates a vector of eight 8-bit values; B0 is least significant.  */
    870 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    871 _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
    872 	     char __b3, char __b2, char __b1, char __b0)
    873 {
    874   return (__m64) __builtin_ia32_vec_init_v8qi (__b0, __b1, __b2, __b3,
    875 					       __b4, __b5, __b6, __b7);
    876 }
    877 
    878 /* Similar, but with the arguments in reverse order.  */
    879 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    880 _mm_setr_pi32 (int __i0, int __i1)
    881 {
    882   return _mm_set_pi32 (__i1, __i0);
    883 }
    884 
    885 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    886 _mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
    887 {
    888   return _mm_set_pi16 (__w3, __w2, __w1, __w0);
    889 }
    890 
    891 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    892 _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
    893 	      char __b4, char __b5, char __b6, char __b7)
    894 {
    895   return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
    896 }
    897 
    898 /* Creates a vector of two 32-bit values, both elements containing I.  */
    899 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    900 _mm_set1_pi32 (int __i)
    901 {
    902   return _mm_set_pi32 (__i, __i);
    903 }
    904 
    905 /* Creates a vector of four 16-bit values, all elements containing W.  */
    906 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    907 _mm_set1_pi16 (short __w)
    908 {
    909   return _mm_set_pi16 (__w, __w, __w, __w);
    910 }
    911 
    912 /* Creates a vector of eight 8-bit values, all elements containing B.  */
    913 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    914 _mm_set1_pi8 (char __b)
    915 {
    916   return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b);
    917 }
    918 
    919 #endif /* __MMX__ */
    920 #endif /* _MMINTRIN_H_INCLUDED */
    921