/* Copyright (C) 2008, 2009 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
				     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
					  __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
				       __may_alias__));
/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ	0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS	0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS	0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q	0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ	0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US	0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US	0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q	0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ	0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US	0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US	0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ	0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ	0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS	0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS	0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ	0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS	0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ	0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ	0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S	0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US	0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ	0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ	0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S	0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US	0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ	0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ	0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS	0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS	0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ	0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ	0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US	0x1f
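
/* Illustrative note (not part of the original header): the O/U letter in
   a predicate name selects the result for NaN operands (ordered
   predicates yield false, unordered ones true), and the Q/S letter
   selects whether quiet NaNs signal an invalid-operation exception.
   A minimal sketch, assuming <immintrin.h> has been included:

     __m256d x = _mm256_set1_pd (__builtin_nan (""));
     __m256d y = _mm256_set1_pd (1.0);
     __m256d a = _mm256_cmp_pd (x, y, _CMP_EQ_OQ);  // NaN input => false
     __m256d b = _mm256_cmp_pd (x, y, _CMP_EQ_UQ);  // NaN input => true
*/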
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Double/single precision floating point blend instructions - select
   data from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
					      (__v4df)__Y,
					      __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
					     (__v8sf)__Y,
					     __M);
}
#else
#define _mm256_blend_pd(X, Y, M) \
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \
					(__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M) \
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \
				       (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
					       (__v4df)__Y,
					       (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
					      (__v8sf)__Y,
					      (__v8sf)__M);
}
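
/* Illustrative sketch (not part of the original header): for the
   immediate form, bit i of the mask selects element i from Y when set
   and from X when clear; the variable form instead uses the sign bit of
   each mask element.

     __m256d x = _mm256_set1_pd (0.0);
     __m256d y = _mm256_set1_pd (1.0);
     __m256d r = _mm256_blend_pd (x, y, 0x5);   // { 1.0, 0.0, 1.0, 0.0 }
*/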
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
					  (__v8sf)__Y,
					  __M);
}
#else
#define _mm256_dp_ps(X, Y, M) \
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \
				    (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}
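
/* Illustrative sketch (not part of the original header): in the
   _mm256_dp_ps mask the high nibble selects which elements enter the
   product-sum and the low nibble selects which result slots receive it
   (the others are zeroed); the operation runs independently in each
   128-bit lane.

     __m256 a = _mm256_set1_ps (1.0f);
     __m256 b = _mm256_set1_ps (2.0f);
     __m256 d = _mm256_dp_ps (a, b, 0xF1);  // per lane: { 8, 0, 0, 0 }
*/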
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
					     __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
					    __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N) \
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \
				      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N) \
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
				      (__v8sf)(__m256)(B), (int)(N)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
					    __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
					   __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P) \
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \
				  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P) \
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \
				      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P) \
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \
				     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), \
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P) \
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), \
				  (__v4sf)(__m128)(Y), (int)(P)))
#endif
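
/* Illustrative usage sketch (not part of the original header): a packed
   compare produces an all-ones/all-zeros mask per element, which
   _mm256_movemask_pd (defined below) reduces to an integer bitmask.

     __m256d a = _mm256_set_pd (4.0, 3.0, 2.0, 1.0);  // { 1, 2, 3, 4 }
     __m256d b = _mm256_set1_pd (2.5);
     __m256d m = _mm256_cmp_pd (a, b, _CMP_LT_OQ);
     int bits = _mm256_movemask_pd (m);  // 0x3: elements 0 and 1 are < 2.5
*/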
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}
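
/* Illustrative note (not part of the original header): the cvtt forms
   truncate toward zero, while the plain cvt forms use the current
   rounding mode (round-to-nearest-even by default).

     __m256d v = _mm256_set1_pd (2.7);
     __m128i t = _mm256_cvttpd_epi32 (v);  // each element: 2 (truncated)
     __m128i r = _mm256_cvtpd_epi32 (v);   // each element: 3 (rounded)
*/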
#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N) \
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \
						(int)(N)))

#define _mm256_extractf128_ps(X, N) \
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \
					       (int)(N)))

#define _mm256_extractf128_si256(X, N) \
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \
						(int)(N)))

#define _mm256_extract_epi32(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      _mm_extract_epi32 (__Y, (N) % 4); \
    }))

#define _mm256_extract_epi16(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      _mm_extract_epi16 (__Y, (N) % 8); \
    }))

#define _mm256_extract_epi8(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      _mm_extract_epi8 (__Y, (N) % 16); \
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      _mm_extract_epi64 (__Y, (N) % 2); \
    }))
#endif
#endif

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}
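
/* Illustrative note (not part of the original header): issuing
   _mm256_zeroupper before calling legacy SSE code avoids the AVX-to-SSE
   transition penalty taken while the upper halves of the YMM registers
   are dirty.  sse_only_routine below is a hypothetical callee.

     __m256d t = _mm256_add_pd (x, y);  // 256-bit AVX work
     _mm256_zeroupper ();               // clear upper YMM state
     sse_only_routine ();               // safe: no transition stall
*/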
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
						(__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
						   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
					       (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
						  (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C) \
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C) \
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))

#define _mm_permute_ps(X, C) \
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C) \
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
						    (__v4df)__Y,
						    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
						   (__v8sf)__Y,
						   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
						    (__v8si)__Y,
						    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C) \
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \
					      (__v4df)(__m256d)(Y), \
					      (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C) \
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \
					     (__v8sf)(__m256)(Y), \
					     (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C) \
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \
					      (__v8si)(__m256i)(Y), \
					      (int)(C)))
#endif

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}
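
/* Illustrative sketch (not part of the original header): the broadcast
   forms load a scalar (or a 128-bit block) from memory and replicate it
   across every position of the destination.

     double d = 3.0;
     __m256d v = _mm256_broadcast_sd (&d);  // { 3.0, 3.0, 3.0, 3.0 }
*/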
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
						     (__v2df)__Y,
						     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
						    (__v4sf)__Y,
						    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
						     (__v4si)__Y,
						     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O) \
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \
					       (__v2df)(__m128d)(Y), \
					       (int)(O)))

#define _mm256_insertf128_ps(X, Y, O) \
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \
					      (__v4sf)(__m128)(Y), \
					      (int)(O)))

#define _mm256_insertf128_si256(X, Y, O) \
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \
					       (__v4si)(__m128i)(Y), \
					       (int)(O)))

#define _mm256_insert_epi32(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \
    }))

#define _mm256_insert_epi16(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \
    }))

#define _mm256_insert_epi8(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \
    }))
#endif
#endif
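
/* Illustrative usage sketch (not part of the original header): the
   element-wise insert/extract helpers above decompose into a 128-bit
   lane extract, an SSE insert/extract, and a lane re-insert, so the
   index must be a compile-time constant.

     __m256i v = _mm256_set1_epi32 (7);
     __m256i w = _mm256_insert_epi32 (v, 42, 5);  // element 5 := 42
     int e = _mm256_extract_epi32 (w, 5);         // e == 42
*/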
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return (__m256d) __builtin_ia32_loadupd256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return (__m256) __builtin_ia32_loadups256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i *__P, __m256i __A)
{
  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
}
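
/* Illustrative note (not part of the original header): the plain
   load/store forms require 32-byte alignment, while the loadu/storeu
   forms accept any address.

     float buf[9] __attribute__ ((aligned (32))) = { 0 };
     __m256 a = _mm256_load_ps (buf);       // OK: 32-byte aligned
     __m256 u = _mm256_loadu_ps (buf + 1);  // OK: unaligned pointer
*/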
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128i __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
					      (__v2di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256i __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
						 (__v4di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128i __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
					     (__v4si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256i __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
						(__v8si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}
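
/* Illustrative sketch (not part of the original header): _mm256_rcp_ps
   returns only an approximate reciprocal (about 12 bits of precision);
   one Newton-Raphson step, r' = 2r - x*r*r, roughly doubles that.
   Assumes x holds nonzero values.

     __m256 r = _mm256_rcp_ps (x);                 // r ~ 1/x
     r = _mm256_sub_ps (_mm256_add_ps (r, r),
			_mm256_mul_ps (x, _mm256_mul_ps (r, r)));
*/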
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)	_mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V)	_mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)	_mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V)	_mm256_round_ps ((V), _MM_FROUND_FLOOR)

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}
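
/* Illustrative note (not part of the original header): the 256-bit
   unpack instructions interleave within each 128-bit lane rather than
   across the whole vector.

     __m256d a = _mm256_set_pd (3.0, 2.0, 1.0, 0.0);  // { 0, 1, 2, 3 }
     __m256d b = _mm256_set_pd (7.0, 6.0, 5.0, 4.0);  // { 4, 5, 6, 7 }
     __m256d lo = _mm256_unpacklo_pd (a, b);          // { 0, 4, 2, 6 }
*/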
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}
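
/* Illustrative usage sketch (not part of the original header):
   _mm256_testz_si256 returns 1 iff (M & V) is all zeros, giving a cheap
   "any bit set" test.  handle_all_zero is a hypothetical helper.

     __m256i v = _mm256_setzero_si256 ();
     if (_mm256_testz_si256 (v, v))
       handle_all_zero ();  // taken: v has no bits set
*/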
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
				 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
	       float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
				 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
		  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
					  __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
		  short __q11, short __q10, short __q09, short __q08,
		  short __q07, short __q06, short __q05, short __q04,
		  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
		 char __q27, char __q26, char __q25, char __q24,
		 char __q23, char __q22, char __q21, char __q20,
		 char __q19, char __q18, char __q17, char __q16,
		 char __q15, char __q14, char __q13, char __q12,
		 char __q11, char __q10, char __q09, char __q08,
		 char __q07, char __q06, char __q05, char __q04,
		 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
		   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}
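
/* Illustrative note (not part of the original header): _mm256_set_*
   takes arguments from the most significant element down to element 0,
   while the _mm256_setr_* forms below take them in memory order.

     __m256i a = _mm256_set_epi32  (7, 6, 5, 4, 3, 2, 1, 0);
     __m256i b = _mm256_setr_epi32 (0, 1, 2, 3, 4, 5, 6, 7);
     // a and b are identical: element i holds the value i
*/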
/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
				 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
					  __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
			   __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
		float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
		   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
		   short __q11, short __q10, short __q09, short __q08,
		   short __q07, short __q06, short __q05, short __q04,
		   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
			   __q04, __q05, __q06, __q07,
			   __q08, __q09, __q10, __q11,
			   __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
		  char __q27, char __q26, char __q25, char __q24,
		  char __q23, char __q22, char __q21, char __q20,
		  char __q19, char __q18, char __q17, char __q16,
		  char __q15, char __q14, char __q13, char __q12,
		  char __q11, char __q10, char __q09, char __q08,
		  char __q07, char __q06, char __q05, char __q04,
		  char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
			  __q04, __q05, __q06, __q07,
			  __q08, __q09, __q10, __q11,
			  __q12, __q13, __q14, __q15,
			  __q16, __q17, __q18, __q19,
			  __q20, __q21, __q22, __q23,
			  __q24, __q25, __q26, __q27,
			  __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
		    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}
/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256(__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When a cast is done from a 128 to a 256-bit type, the low 128 bits of
   the 256-bit result contain the source parameter value and the upper
   128 bits of the result are undefined.  These intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}
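
/* Illustrative note (not part of the original header): because the
   upper 128 bits are undefined after a 128-to-256 cast, initialize
   them explicitly before using the full vector.

     __m128 lo = _mm_set1_ps (1.0f);
     __m256 v = _mm256_castps128_ps256 (lo);             // upper half undefined
     v = _mm256_insertf128_ps (v, _mm_setzero_ps (), 1); // now fully defined
*/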