Home | History | Annotate | Download | only in include
      1 /*===---- xopintrin.h - XOP intrinsics -------------------------------------===
      2  *
      3  * Permission is hereby granted, free of charge, to any person obtaining a copy
      4  * of this software and associated documentation files (the "Software"), to deal
      5  * in the Software without restriction, including without limitation the rights
      6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      7  * copies of the Software, and to permit persons to whom the Software is
      8  * furnished to do so, subject to the following conditions:
      9  *
     10  * The above copyright notice and this permission notice shall be included in
     11  * all copies or substantial portions of the Software.
     12  *
     13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     19  * THE SOFTWARE.
     20  *
     21  *===-----------------------------------------------------------------------===
     22  */
     23 
     24 #ifndef __X86INTRIN_H
     25 #error "Never use <xopintrin.h> directly; include <x86intrin.h> instead."
     26 #endif
     27 
     28 #ifndef __XOPINTRIN_H
     29 #define __XOPINTRIN_H
     30 
     31 #ifndef __XOP__
     32 # error "XOP instruction set is not enabled"
     33 #else
     34 
     35 #include <fma4intrin.h>
     36 
     37 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
     38 _mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
     39 {
     40   return (__m128i)__builtin_ia32_vpmacssww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
     41 }
     42 
     43 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
     44 _mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
     45 {
     46   return (__m128i)__builtin_ia32_vpmacsww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
     47 }
     48 
     49 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
     50 _mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
     51 {
     52   return (__m128i)__builtin_ia32_vpmacsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
     53 }
     54 
     55 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
     56 _mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
     57 {
     58   return (__m128i)__builtin_ia32_vpmacswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
     59 }
     60 
     61 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
     62 _mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
     63 {
     64   return (__m128i)__builtin_ia32_vpmacssdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
     65 }
     66 
     67 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
     68 _mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
     69 {
     70   return (__m128i)__builtin_ia32_vpmacsdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
     71 }
     72 
     73 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
     74 _mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
     75 {
     76   return (__m128i)__builtin_ia32_vpmacssdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
     77 }
     78 
     79 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
     80 _mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
     81 {
     82   return (__m128i)__builtin_ia32_vpmacsdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
     83 }
     84 
     85 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
     86 _mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C)
     87 {
     88   return (__m128i)__builtin_ia32_vpmacssdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
     89 }
     90 
     91 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
     92 _mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C)
     93 {
     94   return (__m128i)__builtin_ia32_vpmacsdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
     95 }
     96 
     97 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
     98 _mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C)
     99 {
    100   return (__m128i)__builtin_ia32_vpmadcsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
    101 }
    102 
    103 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    104 _mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C)
    105 {
    106   return (__m128i)__builtin_ia32_vpmadcswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
    107 }
    108 
    109 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    110 _mm_haddw_epi8(__m128i __A)
    111 {
    112   return (__m128i)__builtin_ia32_vphaddbw((__v16qi)__A);
    113 }
    114 
    115 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    116 _mm_haddd_epi8(__m128i __A)
    117 {
    118   return (__m128i)__builtin_ia32_vphaddbd((__v16qi)__A);
    119 }
    120 
    121 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    122 _mm_haddq_epi8(__m128i __A)
    123 {
    124   return (__m128i)__builtin_ia32_vphaddbq((__v16qi)__A);
    125 }
    126 
    127 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    128 _mm_haddd_epi16(__m128i __A)
    129 {
    130   return (__m128i)__builtin_ia32_vphaddwd((__v8hi)__A);
    131 }
    132 
    133 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    134 _mm_haddq_epi16(__m128i __A)
    135 {
    136   return (__m128i)__builtin_ia32_vphaddwq((__v8hi)__A);
    137 }
    138 
    139 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    140 _mm_haddq_epi32(__m128i __A)
    141 {
    142   return (__m128i)__builtin_ia32_vphadddq((__v4si)__A);
    143 }
    144 
    145 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    146 _mm_haddw_epu8(__m128i __A)
    147 {
    148   return (__m128i)__builtin_ia32_vphaddubw((__v16qi)__A);
    149 }
    150 
    151 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    152 _mm_haddd_epu8(__m128i __A)
    153 {
    154   return (__m128i)__builtin_ia32_vphaddubd((__v16qi)__A);
    155 }
    156 
    157 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    158 _mm_haddq_epu8(__m128i __A)
    159 {
    160   return (__m128i)__builtin_ia32_vphaddubq((__v16qi)__A);
    161 }
    162 
    163 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    164 _mm_haddd_epu16(__m128i __A)
    165 {
    166   return (__m128i)__builtin_ia32_vphadduwd((__v8hi)__A);
    167 }
    168 
    169 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    170 _mm_haddq_epu16(__m128i __A)
    171 {
    172   return (__m128i)__builtin_ia32_vphadduwq((__v8hi)__A);
    173 }
    174 
    175 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    176 _mm_haddq_epu32(__m128i __A)
    177 {
    178   return (__m128i)__builtin_ia32_vphaddudq((__v4si)__A);
    179 }
    180 
    181 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    182 _mm_hsubw_epi8(__m128i __A)
    183 {
    184   return (__m128i)__builtin_ia32_vphsubbw((__v16qi)__A);
    185 }
    186 
    187 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    188 _mm_hsubd_epi16(__m128i __A)
    189 {
    190   return (__m128i)__builtin_ia32_vphsubwd((__v8hi)__A);
    191 }
    192 
    193 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    194 _mm_hsubq_epi32(__m128i __A)
    195 {
    196   return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A);
    197 }
    198 
    199 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    200 _mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
    201 {
    202   return (__m128i)__builtin_ia32_vpcmov(__A, __B, __C);
    203 }
    204 
    205 static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
    206 _mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
    207 {
    208   return (__m256i)__builtin_ia32_vpcmov_256(__A, __B, __C);
    209 }
    210 
    211 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    212 _mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C)
    213 {
    214   return (__m128i)__builtin_ia32_vpperm((__v16qi)__A, (__v16qi)__B, (__v16qi)__C);
    215 }
    216 
    217 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    218 _mm_rot_epi8(__m128i __A, __m128i __B)
    219 {
    220   return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B);
    221 }
    222 
    223 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    224 _mm_rot_epi16(__m128i __A, __m128i __B)
    225 {
    226   return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, (__v8hi)__B);
    227 }
    228 
    229 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    230 _mm_rot_epi32(__m128i __A, __m128i __B)
    231 {
    232   return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B);
    233 }
    234 
    235 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    236 _mm_rot_epi64(__m128i __A, __m128i __B)
    237 {
    238   return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B);
    239 }
    240 
    241 #define _mm_roti_epi8(A, N) __extension__ ({ \
    242   __m128i __A = (A); \
    243   (__m128i)__builtin_ia32_vprotbi((__v16qi)__A, (N)); })
    244 
    245 #define _mm_roti_epi16(A, N) __extension__ ({ \
    246   __m128i __A = (A); \
    247   (__m128i)__builtin_ia32_vprotwi((__v8hi)__A, (N)); })
    248 
    249 #define _mm_roti_epi32(A, N) __extension__ ({ \
    250   __m128i __A = (A); \
    251   (__m128i)__builtin_ia32_vprotdi((__v4si)__A, (N)); })
    252 
    253 #define _mm_roti_epi64(A, N) __extension__ ({ \
    254   __m128i __A = (A); \
    255   (__m128i)__builtin_ia32_vprotqi((__v2di)__A, (N)); })
    256 
    257 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    258 _mm_shl_epi8(__m128i __A, __m128i __B)
    259 {
    260   return (__m128i)__builtin_ia32_vpshlb((__v16qi)__A, (__v16qi)__B);
    261 }
    262 
    263 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    264 _mm_shl_epi16(__m128i __A, __m128i __B)
    265 {
    266   return (__m128i)__builtin_ia32_vpshlw((__v8hi)__A, (__v8hi)__B);
    267 }
    268 
    269 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    270 _mm_shl_epi32(__m128i __A, __m128i __B)
    271 {
    272   return (__m128i)__builtin_ia32_vpshld((__v4si)__A, (__v4si)__B);
    273 }
    274 
    275 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    276 _mm_shl_epi64(__m128i __A, __m128i __B)
    277 {
    278   return (__m128i)__builtin_ia32_vpshlq((__v2di)__A, (__v2di)__B);
    279 }
    280 
    281 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    282 _mm_sha_epi8(__m128i __A, __m128i __B)
    283 {
    284   return (__m128i)__builtin_ia32_vpshab((__v16qi)__A, (__v16qi)__B);
    285 }
    286 
    287 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    288 _mm_sha_epi16(__m128i __A, __m128i __B)
    289 {
    290   return (__m128i)__builtin_ia32_vpshaw((__v8hi)__A, (__v8hi)__B);
    291 }
    292 
    293 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    294 _mm_sha_epi32(__m128i __A, __m128i __B)
    295 {
    296   return (__m128i)__builtin_ia32_vpshad((__v4si)__A, (__v4si)__B);
    297 }
    298 
    299 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    300 _mm_sha_epi64(__m128i __A, __m128i __B)
    301 {
    302   return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B);
    303 }
    304 
    305 #define _mm_com_epu8(A, B, N) __extension__ ({ \
    306   __m128i __A = (A); \
    307   __m128i __B = (B); \
    308   (__m128i)__builtin_ia32_vpcomub((__v16qi)__A, (__v16qi)__B, (N)); })
    309 
    310 #define _mm_com_epu16(A, B, N) __extension__ ({ \
    311   __m128i __A = (A); \
    312   __m128i __B = (B); \
    313   (__m128i)__builtin_ia32_vpcomuw((__v8hi)__A, (__v8hi)__B, (N)); })
    314 
    315 #define _mm_com_epu32(A, B, N) __extension__ ({ \
    316   __m128i __A = (A); \
    317   __m128i __B = (B); \
    318   (__m128i)__builtin_ia32_vpcomud((__v4si)__A, (__v4si)__B, (N)); })
    319 
    320 #define _mm_com_epu64(A, B, N) __extension__ ({ \
    321   __m128i __A = (A); \
    322   __m128i __B = (B); \
    323   (__m128i)__builtin_ia32_vpcomuq((__v2di)__A, (__v2di)__B, (N)); })
    324 
    325 #define _mm_com_epi8(A, B, N) __extension__ ({ \
    326   __m128i __A = (A); \
    327   __m128i __B = (B); \
    328   (__m128i)__builtin_ia32_vpcomb((__v16qi)__A, (__v16qi)__B, (N)); })
    329 
    330 #define _mm_com_epi16(A, B, N) __extension__ ({ \
    331   __m128i __A = (A); \
    332   __m128i __B = (B); \
    333   (__m128i)__builtin_ia32_vpcomw((__v8hi)__A, (__v8hi)__B, (N)); })
    334 
    335 #define _mm_com_epi32(A, B, N) __extension__ ({ \
    336   __m128i __A = (A); \
    337   __m128i __B = (B); \
    338   (__m128i)__builtin_ia32_vpcomd((__v4si)__A, (__v4si)__B, (N)); })
    339 
    340 #define _mm_com_epi64(A, B, N) __extension__ ({ \
    341   __m128i __A = (A); \
    342   __m128i __B = (B); \
    343   (__m128i)__builtin_ia32_vpcomq((__v2di)__A, (__v2di)__B, (N)); })
    344 
    345 #define _mm_permute2_pd(X, Y, C, I) __extension__ ({ \
    346   __m128d __X = (X); \
    347   __m128d __Y = (Y); \
    348   __m128i __C = (C); \
    349   (__m128d)__builtin_ia32_vpermil2pd((__v2df)__X, (__v2df)__Y, \
    350                                      (__v2di)__C, (I)); })
    351 
    352 #define _mm256_permute2_pd(X, Y, C, I) __extension__ ({ \
    353   __m256d __X = (X); \
    354   __m256d __Y = (Y); \
    355   __m256i __C = (C); \
    356   (__m256d)__builtin_ia32_vpermil2pd256((__v4df)__X, (__v4df)__Y, \
    357                                         (__v4di)__C, (I)); })
    358 
    359 #define _mm_permute2_ps(X, Y, C, I) __extension__ ({ \
    360   __m128 __X = (X); \
    361   __m128 __Y = (Y); \
    362   __m128i __C = (C); \
    363   (__m128)__builtin_ia32_vpermil2ps((__v4sf)__X, (__v4sf)__Y, \
    364                                     (__v4si)__C, (I)); })
    365 
    366 #define _mm256_permute2_ps(X, Y, C, I) __extension__ ({ \
    367   __m256 __X = (X); \
    368   __m256 __Y = (Y); \
    369   __m256i __C = (C); \
    370   (__m256)__builtin_ia32_vpermil2ps256((__v8sf)__X, (__v8sf)__Y, \
    371                                        (__v8si)__C, (I)); })
    372 
    373 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
    374 _mm_frcz_ss(__m128 __A)
    375 {
    376   return (__m128)__builtin_ia32_vfrczss((__v4sf)__A);
    377 }
    378 
    379 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    380 _mm_frcz_sd(__m128d __A)
    381 {
    382   return (__m128d)__builtin_ia32_vfrczsd((__v2df)__A);
    383 }
    384 
    385 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
    386 _mm_frcz_ps(__m128 __A)
    387 {
    388   return (__m128)__builtin_ia32_vfrczps((__v4sf)__A);
    389 }
    390 
    391 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    392 _mm_frcz_pd(__m128d __A)
    393 {
    394   return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A);
    395 }
    396 
    397 static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
    398 _mm256_frcz_ps(__m256 __A)
    399 {
    400   return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A);
    401 }
    402 
    403 static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
    404 _mm256_frcz_pd(__m256d __A)
    405 {
    406   return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A);
    407 }
    408 
    409 #endif /* __XOP__ */
    410 
    411 #endif /* __XOPINTRIN_H */
    412