/* Copyright (C) 2008-2014 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVXINTRIN_H_INCLUDED
#define _AVXINTRIN_H_INCLUDED

#ifndef __AVX__
#pragma GCC push_options
#pragma GCC target("avx")
#define __DISABLE_AVX__
#endif /* __AVX__ */

/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
				     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
					  __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
				       __may_alias__));

/* Compare predicates for scalar and packed compare intrinsics.  */
/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ	0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS	0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS	0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q	0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ	0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US	0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US	0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q	0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ	0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US	0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US	0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ	0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ	0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS	0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS	0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ	0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS	0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ	0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ	0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S	0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US	0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ	0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ	0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S	0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US	0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ	0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ	0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS	0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS	0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ	0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ	0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US	0x1f
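/* A typical use of these predicates, for illustration: pass one as the
   third argument of a compare intrinsic declared later in this file
   (here _mm256_cmp_pd), then reduce the lane mask it returns with
   _mm256_movemask_pd.  "Ordered" predicates are false when either
   operand is a NaN; "Q" (quiet) forms do not signal on quiet NaNs,
   while "S" forms do.

     __m256d __a = _mm256_set_pd (4.0, 3.0, 2.0, 1.0);   // lanes { 1, 2, 3, 4 }
     __m256d __b = _mm256_set1_pd (2.5);
     __m256d __m = _mm256_cmp_pd (__a, __b, _CMP_LT_OQ); // all-ones where a < b
     int __lt = _mm256_movemask_pd (__m);                // 0x3: lanes 0 and 1  */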
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Double/single precision floating point blend instructions - select
   data from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
					      (__v4df)__Y,
					      __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
					     (__v8sf)__Y,
					     __M);
}
#else
#define _mm256_blend_pd(X, Y, M)					\
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X),		\
					(__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M)					\
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X),		\
				       (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
					       (__v4df)__Y,
					       (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
					      (__v8sf)__Y,
					      (__v8sf)__M);
}
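/* For illustration: with the immediate form, bit N of the mask selects
   element N from Y (bit set) or X (bit clear); with the variable form,
   the sign bit of each mask element makes the same choice, so a compare
   result can drive the blend directly.

     __m256d __x = _mm256_set_pd (3.0, 2.0, 1.0, 0.0);   // lanes { 0, 1, 2, 3 }
     __m256d __y = _mm256_set_pd (7.0, 6.0, 5.0, 4.0);   // lanes { 4, 5, 6, 7 }
     __m256d __r = _mm256_blend_pd (__x, __y, 0x0a);     // lanes { 0, 5, 2, 7 }
     __m256d __m = _mm256_cmp_pd (__x, __y, _CMP_LT_OQ); // declared below
     __m256d __s = _mm256_blendv_pd (__x, __y, __m);     // __y where __x < __y  */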
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Dot product instructions with mask-defined summing and zeroing of
   parts of the result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
					  (__v8sf)__Y,
					  __M);
}
#else
#define _mm256_dp_ps(X, Y, M)						\
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X),		\
				    (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}
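/* For illustration: _mm256_dp_ps and the horizontal add/subtract
   intrinsics each operate within the two 128-bit halves independently,
   not across the whole vector.

     __m256d __r = _mm256_hadd_pd (__a, __b);
     // __r = { __a[0]+__a[1], __b[0]+__b[1], __a[2]+__a[3], __b[2]+__b[3] }

     __m256 __d = _mm256_dp_ps (__x, __y, 0xf1);
     // per half: the high mask nibble selects which products to sum, the
     // low nibble selects which result elements receive that sum; with
     // 0xf1, element 0 of each half holds its 4-element dot product.  */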
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
					     __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
					    __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N)					\
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A),		\
				      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N)					\
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A),		\
				      (__v8sf)(__m256)(B), (int)(N)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
					    __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
					   __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P)						\
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X),		\
				      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P)						\
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X),		\
				     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}
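/* For illustration: _mm256_cvtpd_epi32 rounds according to the current
   MXCSR rounding mode (round-to-nearest-even by default), while the "t"
   variants truncate toward zero.

     __m256d __v = _mm256_set_pd (-2.5, 2.5, 1.7, -1.7);
     __m128i __r = _mm256_cvtpd_epi32 (__v);   // lanes { -2, 2, 2, -2 }
     __m128i __t = _mm256_cvttpd_epi32 (__v);  // lanes { -1, 1, 2, -2 }  */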
#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N)					\
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X),	\
						(int)(N)))

#define _mm256_extractf128_ps(X, N)					\
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X),	\
					       (int)(N)))

#define _mm256_extractf128_si256(X, N)					\
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X),	\
						(int)(N)))

#define _mm256_extract_epi32(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      _mm_extract_epi32 (__Y, (N) % 4);					\
    }))

#define _mm256_extract_epi16(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      _mm_extract_epi16 (__Y, (N) % 8);					\
    }))

#define _mm256_extract_epi8(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      _mm_extract_epi8 (__Y, (N) % 16);					\
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      _mm_extract_epi64 (__Y, (N) % 2);					\
    }))
#endif
#endif
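/* For illustration: the element index is split into a 128-bit-half
   selector (the shift) and an index within that half (the modulo), so
   element 5 of eight 32-bit elements is element 1 of the upper half.

     __m256i __v = _mm256_set_epi32 (7, 6, 5, 4, 3, 2, 1, 0);
     int __e = _mm256_extract_epi32 (__v, 5);   // 5  */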
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
						(__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
						   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
					       (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
						  (__v8si)__C);
}
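/* For illustration: the variable-control permutes select each result
   element from within its own 128-bit half; vpermilps uses the low two
   bits of each 32-bit control element, vpermilpd uses bit 1 of each
   64-bit control element.  (The _mm_set_* helpers below come from the
   SSE headers that <immintrin.h> pulls in.)

     __m128 __x = _mm_set_ps (3.0f, 2.0f, 1.0f, 0.0f);  // lanes { 0, 1, 2, 3 }
     __m128i __c = _mm_set_epi32 (0, 1, 2, 3);          // lanes { 3, 2, 1, 0 }
     __m128 __r = _mm_permutevar_ps (__x, __c);         // lanes { 3, 2, 1, 0 }  */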
#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C)						\
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C)						\
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))

#define _mm_permute_ps(X, C)						\
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C)						\
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
						    (__v4df)__Y,
						    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
						   (__v8sf)__Y,
						   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
						    (__v8si)__Y,
						    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C)					\
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X),	\
					      (__v4df)(__m256d)(Y),	\
					      (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C)					\
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X),	\
					     (__v8sf)(__m256)(Y),	\
					     (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C)				\
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),	\
					      (__v8si)(__m256i)(Y),	\
					      (int)(C)))
#endif
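/* For illustration: in _mm256_permute2f128_*, each nibble of the control
   byte selects a 128-bit half for the corresponding half of the result
   (0/1 = low/high half of X, 2/3 = low/high half of Y; setting bit 3 of
   a nibble zeroes that half instead).

     __m256d __r = _mm256_permute2f128_pd (__x, __y, 0x20);
     // __r = { low half of __x, low half of __y }  */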
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
						     (__v2df)__Y,
						     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
						    (__v4sf)__Y,
						    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
						     (__v4si)__Y,
						     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O)					\
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X),	\
					       (__v2df)(__m128d)(Y),	\
					       (int)(O)))

#define _mm256_insertf128_ps(X, Y, O)					\
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X),	\
					      (__v4sf)(__m128)(Y),	\
					      (int)(O)))

#define _mm256_insertf128_si256(X, Y, O)				\
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X),	\
					       (__v4si)(__m128i)(Y),	\
					       (int)(O)))

#define _mm256_insert_epi32(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2);			\
    }))

#define _mm256_insert_epi16(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3);			\
    }))

#define _mm256_insert_epi8(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4);			\
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1);			\
    }))
#endif
#endif
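/* For illustration: a common way to build a 256-bit vector from two
   128-bit halves (__lo and __hi standing for any two __m128 values) is
   to widen the low half with a cast, declared near the end of this
   file, and insert the high half:

     __m256 __v = _mm256_insertf128_ps (_mm256_castps128_ps256 (__lo),
					__hi, 1);  */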
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return (__m256d) __builtin_ia32_loadupd256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return (__m256) __builtin_ia32_loadups256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i *__P, __m256i __A)
{
  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
}
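/* For illustration: the plain load/store forms require the address to
   be 32-byte aligned, while the loadu/storeu forms accept any
   alignment (__x and __ptr here are placeholders, not part of the API).

     double __buf[4] __attribute__ ((aligned (32)));
     _mm256_store_pd (__buf, __x);    // __buf is 32-byte aligned
     _mm256_storeu_pd (__ptr, __x);   // __ptr may be arbitrarily aligned  */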
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128i __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
					      (__v2di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256i __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
						 (__v4di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128i __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
					     (__v4si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256i __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
						(__v8si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
}
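/* For illustration: the sign bit of each mask element gates its lane; a
   masked load leaves unselected lanes zero, and a masked store leaves
   the corresponding memory untouched (__p here is a placeholder for any
   valid double pointer).

     __m256i __m = _mm256_set_epi64x (0, -1, 0, -1);  // select lanes 0 and 2
     __m256d __v = _mm256_maskload_pd (__p, __m);     // { __p[0], 0, __p[2], 0 }  */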
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M)						\
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M)						\
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)	_mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V)	_mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)	_mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V)	_mm256_round_ps ((V), _MM_FROUND_FLOOR)

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}
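/* For illustration: the 256-bit unpacks interleave within each 128-bit
   half rather than across the whole vector.

     __m256d __r = _mm256_unpacklo_pd (__a, __b);
     // __r = { __a[0], __b[0], __a[2], __b[2] }
     __m256d __s = _mm256_unpackhi_pd (__a, __b);
     // __s = { __a[1], __b[1], __a[3], __b[3] }  */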
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_pd (void)
{
  __m256d __Y = __Y;
  return __Y;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_ps (void)
{
  __m256 __Y = __Y;
  return __Y;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_si256 (void)
{
  __m256i __Y = __Y;
  return __Y;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
				 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}
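/* For illustration: _mm256_testz_si256 returns 1 when (__M & __V) is
   all zeros, and _mm256_testc_si256 returns 1 when (~__M & __V) is all
   zeros, which gives the two common whole-vector checks (using
   _mm256_set1_epi32, defined below):

     int __all_zero = _mm256_testz_si256 (__v, __v);  // 1 iff __v == 0
     int __all_ones = _mm256_testc_si256 (__v, _mm256_set1_epi32 (-1));  */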
/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
	       float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
				 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
		  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
					  __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
		  short __q11, short __q10, short __q09, short __q08,
		  short __q07, short __q06, short __q05, short __q04,
		  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
		 char __q27, char __q26, char __q25, char __q24,
		 char __q23, char __q22, char __q21, char __q20,
		 char __q19, char __q18, char __q17, char __q16,
		 char __q15, char __q14, char __q13, char __q12,
		 char __q11, char __q10, char __q09, char __q08,
		 char __q07, char __q06, char __q05, char __q04,
		 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
		   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
				 __A, __A, __A, __A };
}
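/* For illustration: the _mm256_set_* arguments run from the highest
   element down to element 0, so the first argument lands in the highest
   lane; the _mm256_setr_* functions below take the same values in
   memory (element 0 first) order.

     __m256d __a = _mm256_set_pd (3.0, 2.0, 1.0, 0.0);   // lanes { 0, 1, 2, 3 }
     __m256d __b = _mm256_setr_pd (0.0, 1.0, 2.0, 3.0);  // same lanes  */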
/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
					  __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
			   __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
		float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
		   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
		   short __q11, short __q10, short __q09, short __q08,
		   short __q07, short __q06, short __q05, short __q04,
		   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
			   __q04, __q05, __q06, __q07,
			   __q08, __q09, __q10, __q11,
			   __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
		  char __q27, char __q26, char __q25, char __q24,
		  char __q23, char __q22, char __q21, char __q20,
		  char __q19, char __q18, char __q17, char __q16,
		  char __q15, char __q14, char __q13, char __q12,
		  char __q11, char __q10, char __q09, char __q08,
		  char __q07, char __q06, char __q05, char __q04,
		  char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
			  __q04, __q05, __q06, __q07,
			  __q08, __q09, __q10, __q11,
			  __q12, __q13, __q14, __q15,
			  __q16, __q17, __q18, __q19,
			  __q20, __q21, __q22, __q23,
			  __q24, __q25, __q26, __q27,
			  __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
		    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}

/* Casts between various SP, DP, INT vector types.  Note that these do
   not convert values; they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256 (__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}
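/* For illustration: the narrowing casts above return the low 128 bits
   of their argument and typically compile to no instructions at all.

     __m128d __lo = _mm256_castpd256_pd128 (__v);  // low two doubles of __v  */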
/* When cast is done from a 128 to 256-bit type, the low 128 bits of
   the 256-bit result contain the source parameter value and the upper
   128 bits of the result are undefined.  These intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}

#ifdef __DISABLE_AVX__
#undef __DISABLE_AVX__
#pragma GCC pop_options
#endif /* __DISABLE_AVX__ */

#endif /* _AVXINTRIN_H_INCLUDED */