1 /*===---- xopintrin.h - FMA4 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24 #ifndef __X86INTRIN_H 25 #error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead." 26 #endif 27 28 #ifndef __XOPINTRIN_H 29 #define __XOPINTRIN_H 30 31 #ifndef __XOP__ 32 # error "XOP instruction set is not enabled" 33 #else 34 35 #include <fma4intrin.h> 36 37 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 38 _mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C) 39 { 40 return (__m128i)__builtin_ia32_vpmacssww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C); 41 } 42 43 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 44 _mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C) 45 { 46 return (__m128i)__builtin_ia32_vpmacsww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C); 47 } 48 49 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 50 _mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C) 51 { 52 return (__m128i)__builtin_ia32_vpmacsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); 53 } 54 55 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 56 _mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C) 57 { 58 return (__m128i)__builtin_ia32_vpmacswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); 59 } 60 61 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 62 _mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C) 63 { 64 return (__m128i)__builtin_ia32_vpmacssdd((__v4si)__A, (__v4si)__B, (__v4si)__C); 65 } 66 67 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 68 _mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C) 69 { 70 return (__m128i)__builtin_ia32_vpmacsdd((__v4si)__A, (__v4si)__B, (__v4si)__C); 71 } 72 73 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 74 _mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C) 75 { 76 return (__m128i)__builtin_ia32_vpmacssdql((__v4si)__A, (__v4si)__B, (__v2di)__C); 77 } 78 79 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 80 _mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C) 81 { 82 return (__m128i)__builtin_ia32_vpmacsdql((__v4si)__A, (__v4si)__B, (__v2di)__C); 83 } 84 85 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 86 _mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C) 87 { 88 return (__m128i)__builtin_ia32_vpmacssdqh((__v4si)__A, (__v4si)__B, (__v2di)__C); 89 } 90 91 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 92 _mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C) 93 { 94 return (__m128i)__builtin_ia32_vpmacsdqh((__v4si)__A, (__v4si)__B, (__v2di)__C); 95 } 96 97 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 98 _mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C) 99 { 100 return (__m128i)__builtin_ia32_vpmadcsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); 101 } 102 103 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 104 _mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C) 105 { 106 return (__m128i)__builtin_ia32_vpmadcswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); 107 } 108 109 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 110 _mm_haddw_epi8(__m128i __A) 111 { 112 return (__m128i)__builtin_ia32_vphaddbw((__v16qi)__A); 113 } 114 115 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 116 _mm_haddd_epi8(__m128i __A) 117 { 118 return (__m128i)__builtin_ia32_vphaddbd((__v16qi)__A); 119 } 120 121 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 122 _mm_haddq_epi8(__m128i __A) 123 { 124 return (__m128i)__builtin_ia32_vphaddbq((__v16qi)__A); 125 } 126 127 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 128 _mm_haddd_epi16(__m128i __A) 129 { 130 return (__m128i)__builtin_ia32_vphaddwd((__v8hi)__A); 131 } 132 133 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 134 _mm_haddq_epi16(__m128i __A) 135 { 136 return (__m128i)__builtin_ia32_vphaddwq((__v8hi)__A); 137 } 138 139 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 140 _mm_haddq_epi32(__m128i __A) 141 { 142 return (__m128i)__builtin_ia32_vphadddq((__v4si)__A); 143 } 144 145 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 146 _mm_haddw_epu8(__m128i __A) 147 { 148 return (__m128i)__builtin_ia32_vphaddubw((__v16qi)__A); 149 } 150 151 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 152 _mm_haddd_epu8(__m128i __A) 153 { 154 return (__m128i)__builtin_ia32_vphaddubd((__v16qi)__A); 155 } 156 157 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 158 _mm_haddq_epu8(__m128i __A) 159 { 160 return (__m128i)__builtin_ia32_vphaddubq((__v16qi)__A); 161 } 162 163 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 164 _mm_haddd_epu16(__m128i __A) 165 { 166 return (__m128i)__builtin_ia32_vphadduwd((__v8hi)__A); 167 } 168 169 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 170 _mm_haddq_epu16(__m128i __A) 171 { 172 return (__m128i)__builtin_ia32_vphadduwq((__v8hi)__A); 173 } 174 175 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 176 _mm_haddq_epu32(__m128i __A) 177 { 178 return (__m128i)__builtin_ia32_vphaddudq((__v4si)__A); 179 } 180 181 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 182 _mm_hsubw_epi8(__m128i __A) 183 { 184 return (__m128i)__builtin_ia32_vphsubbw((__v16qi)__A); 185 } 186 187 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 188 _mm_hsubd_epi16(__m128i __A) 189 { 190 return (__m128i)__builtin_ia32_vphsubwd((__v8hi)__A); 191 } 192 193 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 194 _mm_hsubq_epi32(__m128i __A) 195 { 196 return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A); 197 } 198 199 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 200 _mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C) 201 { 202 return (__m128i)__builtin_ia32_vpcmov(__A, __B, __C); 203 } 204 205 static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 206 _mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C) 207 { 208 return (__m256i)__builtin_ia32_vpcmov_256(__A, __B, __C); 209 } 210 211 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 212 _mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C) 213 { 214 return (__m128i)__builtin_ia32_vpperm((__v16qi)__A, (__v16qi)__B, (__v16qi)__C); 215 } 216 217 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 218 _mm_rot_epi8(__m128i __A, __m128i __B) 219 { 220 return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B); 221 } 222 223 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 224 _mm_rot_epi16(__m128i __A, __m128i __B) 225 { 226 return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, (__v8hi)__B); 227 } 228 229 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 230 _mm_rot_epi32(__m128i __A, __m128i __B) 231 { 232 return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B); 233 } 234 235 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 236 _mm_rot_epi64(__m128i __A, __m128i __B) 237 { 238 return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B); 239 } 240 241 #define _mm_roti_epi8(A, N) __extension__ ({ \ 242 __m128i __A = (A); \ 243 (__m128i)__builtin_ia32_vprotbi((__v16qi)__A, (N)); }) 244 245 #define _mm_roti_epi16(A, N) __extension__ ({ \ 246 __m128i __A = (A); \ 247 (__m128i)__builtin_ia32_vprotwi((__v8hi)__A, (N)); }) 248 249 #define _mm_roti_epi32(A, N) __extension__ ({ \ 250 __m128i __A = (A); \ 251 (__m128i)__builtin_ia32_vprotdi((__v4si)__A, (N)); }) 252 253 #define _mm_roti_epi64(A, N) __extension__ ({ \ 254 __m128i __A = (A); \ 255 (__m128i)__builtin_ia32_vprotqi((__v2di)__A, (N)); }) 256 257 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 258 _mm_shl_epi8(__m128i __A, __m128i __B) 259 { 260 return (__m128i)__builtin_ia32_vpshlb((__v16qi)__A, (__v16qi)__B); 261 } 262 263 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 264 _mm_shl_epi16(__m128i __A, __m128i __B) 265 { 266 return (__m128i)__builtin_ia32_vpshlw((__v8hi)__A, (__v8hi)__B); 267 } 268 269 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 270 _mm_shl_epi32(__m128i __A, __m128i __B) 271 { 272 return (__m128i)__builtin_ia32_vpshld((__v4si)__A, (__v4si)__B); 273 } 274 275 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 276 _mm_shl_epi64(__m128i __A, __m128i __B) 277 { 278 return (__m128i)__builtin_ia32_vpshlq((__v2di)__A, (__v2di)__B); 279 } 280 281 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 282 _mm_sha_epi8(__m128i __A, __m128i __B) 283 { 284 return (__m128i)__builtin_ia32_vpshab((__v16qi)__A, (__v16qi)__B); 285 } 286 287 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 288 _mm_sha_epi16(__m128i __A, __m128i __B) 289 { 290 return (__m128i)__builtin_ia32_vpshaw((__v8hi)__A, (__v8hi)__B); 291 } 292 293 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 294 _mm_sha_epi32(__m128i __A, __m128i __B) 295 { 296 return (__m128i)__builtin_ia32_vpshad((__v4si)__A, (__v4si)__B); 297 } 298 299 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 300 _mm_sha_epi64(__m128i __A, __m128i __B) 301 { 302 return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B); 303 } 304 305 #define _mm_com_epu8(A, B, N) __extension__ ({ \ 306 __m128i __A = (A); \ 307 __m128i __B = (B); \ 308 (__m128i)__builtin_ia32_vpcomub((__v16qi)__A, (__v16qi)__B, (N)); }) 309 310 #define _mm_com_epu16(A, B, N) __extension__ ({ \ 311 __m128i __A = (A); \ 312 __m128i __B = (B); \ 313 (__m128i)__builtin_ia32_vpcomuw((__v8hi)__A, (__v8hi)__B, (N)); }) 314 315 #define _mm_com_epu32(A, B, N) __extension__ ({ \ 316 __m128i __A = (A); \ 317 __m128i __B = (B); \ 318 (__m128i)__builtin_ia32_vpcomud((__v4si)__A, (__v4si)__B, (N)); }) 319 320 #define _mm_com_epu64(A, B, N) __extension__ ({ \ 321 __m128i __A = (A); \ 322 __m128i __B = (B); \ 323 (__m128i)__builtin_ia32_vpcomuq((__v2di)__A, (__v2di)__B, (N)); }) 324 325 #define _mm_com_epi8(A, B, N) __extension__ ({ \ 326 __m128i __A = (A); \ 327 __m128i __B = (B); \ 328 (__m128i)__builtin_ia32_vpcomb((__v16qi)__A, (__v16qi)__B, (N)); }) 329 330 #define _mm_com_epi16(A, B, N) __extension__ ({ \ 331 __m128i __A = (A); \ 332 __m128i __B = (B); \ 333 (__m128i)__builtin_ia32_vpcomw((__v8hi)__A, (__v8hi)__B, (N)); }) 334 335 #define _mm_com_epi32(A, B, N) __extension__ ({ \ 336 __m128i __A = (A); \ 337 __m128i __B = (B); \ 338 (__m128i)__builtin_ia32_vpcomd((__v4si)__A, (__v4si)__B, (N)); }) 339 340 #define _mm_com_epi64(A, B, N) __extension__ ({ \ 341 __m128i __A = (A); \ 342 __m128i __B = (B); \ 343 (__m128i)__builtin_ia32_vpcomq((__v2di)__A, (__v2di)__B, (N)); }) 344 345 #define _mm_permute2_pd(X, Y, C, I) __extension__ ({ \ 346 __m128d __X = (X); \ 347 __m128d __Y = (Y); \ 348 __m128i __C = (C); \ 349 (__m128d)__builtin_ia32_vpermil2pd((__v2df)__X, (__v2df)__Y, \ 350 (__v2di)__C, (I)); }) 351 352 #define _mm256_permute2_pd(X, Y, C, I) __extension__ ({ \ 353 __m256d __X = (X); \ 354 __m256d __Y = (Y); \ 355 __m256i __C = (C); \ 356 (__m256d)__builtin_ia32_vpermil2pd256((__v4df)__X, (__v4df)__Y, \ 357 (__v4di)__C, (I)); }) 358 359 #define _mm_permute2_ps(X, Y, C, I) __extension__ ({ \ 360 __m128 __X = (X); \ 361 __m128 __Y = (Y); \ 362 __m128i __C = (C); \ 363 (__m128)__builtin_ia32_vpermil2ps((__v4sf)__X, (__v4sf)__Y, \ 364 (__v4si)__C, (I)); }) 365 366 #define _mm256_permute2_ps(X, Y, C, I) __extension__ ({ \ 367 __m256 __X = (X); \ 368 __m256 __Y = (Y); \ 369 __m256i __C = (C); \ 370 (__m256)__builtin_ia32_vpermil2ps256((__v8sf)__X, (__v8sf)__Y, \ 371 (__v8si)__C, (I)); }) 372 373 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 374 _mm_frcz_ss(__m128 __A) 375 { 376 return (__m128)__builtin_ia32_vfrczss((__v4sf)__A); 377 } 378 379 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 380 _mm_frcz_sd(__m128d __A) 381 { 382 return (__m128d)__builtin_ia32_vfrczsd((__v2df)__A); 383 } 384 385 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 386 _mm_frcz_ps(__m128 __A) 387 { 388 return (__m128)__builtin_ia32_vfrczps((__v4sf)__A); 389 } 390 391 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 392 _mm_frcz_pd(__m128d __A) 393 { 394 return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A); 395 } 396 397 static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) 398 _mm256_frcz_ps(__m256 __A) 399 { 400 return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A); 401 } 402 403 static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) 404 _mm256_frcz_pd(__m256d __A) 405 { 406 return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A); 407 } 408 409 #endif /* __XOP__ */ 410 411 #endif /* __XOPINTRIN_H */ 412