/* Copyright (C) 2008-2014 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVXINTRIN_H_INCLUDED
#define _AVXINTRIN_H_INCLUDED

#ifndef __AVX__
#pragma GCC push_options
#pragma GCC target("avx")
#define __DISABLE_AVX__
#endif /* __AVX__ */

/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
				     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
					  __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
				       __may_alias__));

/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ	0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS	0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS	0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q	0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ	0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US	0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US	0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q	0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ	0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US	0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US	0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ	0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ	0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS	0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS	0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ	0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS	0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ	0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ	0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S	0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US	0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ	0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ	0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S	0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US	0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ	0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ	0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS	0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS	0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ	0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ	0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US	0x1f

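/* A minimal usage sketch for the predicate macros above, assuming the
   comparison and movemask intrinsics defined later in this header.
   Guarded out with #if 0 since this file only declares the API.  */
#if 0
#include <immintrin.h>

static int
count_lanes_less_than (__m256d __a, __m256d __b)
{
  /* Each lane becomes all-ones where __a < __b (ordered,
     non-signaling), all-zeros otherwise.  */
  __m256d __lt = _mm256_cmp_pd (__a, __b, _CMP_LT_OQ);
  /* Pack the four lane sign bits into bits 3:0 and count them.  */
  return __builtin_popcount (_mm256_movemask_pd (__lt));
}
#endif
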
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Double/single precision floating point blend instructions - select
   data from two sources using a constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
					      (__v4df)__Y,
					      __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
					     (__v8sf)__Y,
					     __M);
}
#else
#define _mm256_blend_pd(X, Y, M)					\
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X),		\
					(__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M)					\
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X),		\
				       (__v8sf)(__m256)(Y), (int)(M)))
#endif
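/* A minimal sketch of the immediate-mask blend: bit I of the mask
   selects lane I from the second source, a clear bit keeps the first.
   The values here are purely illustrative.  Guarded out.  */
#if 0
#include <immintrin.h>

static __m256d
blend_even_odd (void)
{
  __m256d __x = _mm256_set1_pd (1.0);
  __m256d __y = _mm256_set1_pd (2.0);
  /* Mask 0x5 = 0b0101: lanes 0 and 2 come from __y, lanes 1 and 3
     from __x, giving { 2.0, 1.0, 2.0, 1.0 }.  */
  return _mm256_blend_pd (__x, __y, 0x5);
}
#endif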

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
					       (__v4df)__Y,
					       (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
					      (__v8sf)__Y,
					      (__v8sf)__M);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Dot product instructions with mask-defined summing and zeroing parts
   of the result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
					  (__v8sf)__Y,
					  __M);
}
#else
#define _mm256_dp_ps(X, Y, M)						\
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X),		\
				    (__v8sf)(__m256)(Y), (int)(M)))
#endif
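/* A minimal sketch of the mask semantics, assuming VDPPS behavior:
   within each 128-bit half, the high four mask bits choose which
   products enter the sum and the low four choose which result lanes
   receive it; the remaining lanes are zeroed.  Guarded out.  */
#if 0
#include <immintrin.h>

static __m256
dot4_per_half (__m256 __x, __m256 __y)
{
  /* 0xF1: multiply and sum all four pairs in each half, write the sum
     to lane 0 of that half and zero the other three lanes.  */
  return _mm256_dp_ps (__x, __y, 0xF1);
}
#endif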

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
					     __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
					    __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N)					\
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A),		\
				      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N)					\
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A),		\
				      (__v8sf)(__m256)(B), (int)(N)))
#endif
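/* A minimal sketch of the 256-bit shuffle, assuming it operates per
   128-bit half: for _mm256_shuffle_pd, mask bits 0-1 pick the low-half
   elements (from A then B) and bits 2-3 pick the high-half elements
   likewise.  Guarded out.  */
#if 0
#include <immintrin.h>

static __m256d
interleave_high (__m256d __a, __m256d __b)
{
  /* 0xF = 0b1111: take the upper element of each half of A and B,
     i.e. { a1, b1, a3, b3 } -- the same result as unpackhi.  */
  return _mm256_shuffle_pd (__a, __b, 0xF);
}
#endif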

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
					    __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
					   __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P)						\
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X),		\
				      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P)						\
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X),		\
				     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}

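/* A minimal sketch of the conversion pairs above: packed-double
   results narrow to 128 bits, the "tt" variants truncate toward zero,
   and the plain ones round according to MXCSR.  Guarded out.  */
#if 0
#include <immintrin.h>

static void
convert_examples (__m256d __d)
{
  __m128  __f = _mm256_cvtpd_ps (__d);     /* 4 doubles -> 4 floats.  */
  __m128i __t = _mm256_cvttpd_epi32 (__d); /* truncate toward zero.  */
  __m128i __r = _mm256_cvtpd_epi32 (__d);  /* round per MXCSR.  */
  (void) __f; (void) __t; (void) __r;
}
#endif
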
#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N)					\
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X),	\
						(int)(N)))

#define _mm256_extractf128_ps(X, N)					\
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X),	\
					       (int)(N)))

#define _mm256_extractf128_si256(X, N)					\
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X),	\
						(int)(N)))

#define _mm256_extract_epi32(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      _mm_extract_epi32 (__Y, (N) % 4);					\
    }))

#define _mm256_extract_epi16(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      _mm_extract_epi16 (__Y, (N) % 8);					\
    }))

#define _mm256_extract_epi8(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      _mm_extract_epi8 (__Y, (N) % 16);					\
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      _mm_extract_epi64 (__Y, (N) % 2);					\
    }))
#endif
#endif
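/* A minimal sketch of the two-step element extract defined above: the
   index's high bits select the 128-bit half, the low bits the element
   within it.  Guarded out.  */
#if 0
#include <immintrin.h>

static int
fifth_int (__m256i __v)
{
  /* Index 5: half 5 >> 2 = 1 (upper), element 5 % 4 = 1 within it.  */
  return _mm256_extract_epi32 (__v, 5);
}
#endif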

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
						(__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
						   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
					       (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
						  (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C)						\
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C)						\
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X),	(int)(C)))

#define _mm_permute_ps(X, C)						\
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C)						\
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif
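/* A minimal sketch of the immediate in-lane permute: for
   _mm256_permute_pd each control bit picks one of the two doubles
   within its own 128-bit half.  Guarded out.  */
#if 0
#include <immintrin.h>

static __m256d
duplicate_low_of_each_half (__m256d __x)
{
  /* Control 0x0: both elements of each half take that half's
     element 0, giving { x0, x0, x2, x2 } (same as movedup).  */
  return _mm256_permute_pd (__x, 0x0);
}
#endif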

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
						    (__v4df)__Y,
						    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
						   (__v8sf)__Y,
						   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
						    (__v8si)__Y,
						    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C)					\
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X),	\
					      (__v4df)(__m256d)(Y),	\
					      (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C)					\
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X),	\
					     (__v8sf)(__m256)(Y),	\
					     (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C)				\
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),	\
					      (__v8si)(__m256i)(Y),	\
					      (int)(C)))
#endif
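/* A minimal sketch of the cross-lane permute, assuming the VPERM2F128
   control encoding: each nibble selects a 128-bit half for the
   corresponding half of the result (0/1 from X, 2/3 from Y), and bit 3
   of a nibble zeroes that half instead.  Guarded out.  */
#if 0
#include <immintrin.h>

static __m256d
swap_middle_halves (__m256d __x, __m256d __y)
{
  /* 0x21: result low half = X's high half (selector 1), result high
     half = Y's low half (selector 2).  */
  return _mm256_permute2f128_pd (__x, __y, 0x21);
}
#endif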

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}

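/* A minimal sketch of the broadcast loads above: a single scalar (or a
   whole 128-bit vector) is read from memory and replicated across the
   destination.  Guarded out.  */
#if 0
#include <immintrin.h>

static __m256d
splat_from_memory (double const *__p)
{
  /* Replicates *__p into all four lanes, like _mm256_set1_pd (*__p)
     but from a single memory-source instruction.  */
  return _mm256_broadcast_sd (__p);
}
#endif
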
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
						     (__v2df)__Y,
						     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
						    (__v4sf)__Y,
						    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
						     (__v4si)__Y,
						     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O)					\
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X),	\
					       (__v2df)(__m128d)(Y),	\
					       (int)(O)))

#define _mm256_insertf128_ps(X, Y, O)					\
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X),	\
					      (__v4sf)(__m128)(Y),  	\
					      (int)(O)))

#define _mm256_insertf128_si256(X, Y, O)				\
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X),	\
					       (__v4si)(__m128i)(Y),	\
					       (int)(O)))

#define _mm256_insert_epi32(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2);			\
    }))

#define _mm256_insert_epi16(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3);			\
    }))

#define _mm256_insert_epi8(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4);			\
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1);			\
    }))
#endif
#endif
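/* A minimal sketch of the element insert defined above: extract the
   128-bit half holding the element, patch it, and write the half
   back.  Guarded out.  */
#if 0
#include <immintrin.h>

static __m256i
set_sixth_int (__m256i __v)
{
  /* Index 6 lives in the upper half (6 >> 2 = 1) at position
     6 % 4 = 2; every other element is preserved.  */
  return _mm256_insert_epi32 (__v, 42, 6);
}
#endif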

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return (__m256d) __builtin_ia32_loadupd256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return (__m256) __builtin_ia32_loadups256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i *__P, __m256i __A)
{
  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
}

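/* A minimal sketch of the load/store pairs above: the plain forms
   require 32-byte alignment, the *u forms do not.  Guarded out.  */
#if 0
#include <immintrin.h>

static void
copy4 (double *__dst, double const *__src)
{
  double __buf[4] __attribute__ ((aligned (32)));
  _mm256_store_pd (__buf, _mm256_loadu_pd (__src)); /* unaligned in.  */
  _mm256_storeu_pd (__dst, _mm256_load_pd (__buf)); /* aligned out.  */
}
#endif
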
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128i __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
					      (__v2di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256i __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
						 (__v4di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128i __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
					     (__v4si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256i __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
						(__v8si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
}

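/* A minimal sketch of the masked forms above: only lanes whose mask
   element has its sign bit set are read or written; masked-off load
   lanes come back as zero, and masked-off store lanes leave memory
   untouched.  Guarded out.  */
#if 0
#include <immintrin.h>

static __m256d
load_first_three (double const *__p)
{
  /* Enable lanes 0-2 only; note _mm256_set_epi64x lists lane 3
     first.  */
  __m256i __m = _mm256_set_epi64x (0, -1, -1, -1);
  return _mm256_maskload_pd (__p, __m);
}
#endif
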
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)	_mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V)	_mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)	_mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V)	_mm256_round_ps ((V), _MM_FROUND_FLOOR)

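/* A minimal sketch of the rounding control: the mode comes from the
   _MM_FROUND_* macros (defined alongside the SSE4.1 intrinsics), which
   the ceil/floor wrappers above pin to specific directions.  Guarded
   out.  */
#if 0
#include <immintrin.h>

static __m256d
round_examples (__m256d __v)
{
  __m256d __n = _mm256_round_pd (__v, _MM_FROUND_TO_NEAREST_INT
				      | _MM_FROUND_NO_EXC);
  __m256d __f = _mm256_floor_pd (__v);  /* _MM_FROUND_FLOOR  */
  return _mm256_add_pd (__n, __f);
}
#endif
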
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}

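/* A minimal sketch of the test intrinsics: they only set flags, and
   the return value is the flag bit itself.  The classic idiom below
   checks a vector for all-zero without extracting lanes.  Guarded
   out.  */
#if 0
#include <immintrin.h>

static int
is_all_zero (__m256i __v)
{
  /* ZF is set iff (__v AND __v) == 0, i.e. iff __v is zero.  */
  return _mm256_testz_si256 (__v, __v);
}
#endif
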
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_pd (void)
{
  /* The self-initialization is deliberate: it yields an indeterminate
     value without triggering an uninitialized-variable warning.  The
     same pattern is used by the two intrinsics below.  */
  __m256d __Y = __Y;
  return __Y;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_ps (void)
{
  __m256 __Y = __Y;
  return __Y;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_si256 (void)
{
  __m256i __Y = __Y;
  return __Y;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
				 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
	       float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
				 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
		  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
					  __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
		  short __q11, short __q10, short __q09, short __q08,
		  short __q07, short __q06, short __q05, short __q04,
		  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8  (char __q31, char __q30, char __q29, char __q28,
		  char __q27, char __q26, char __q25, char __q24,
		  char __q23, char __q22, char __q21, char __q20,
		  char __q19, char __q18, char __q17, char __q16,
		  char __q15, char __q14, char __q13, char __q12,
		  char __q11, char __q10, char __q09, char __q08,
		  char __q07, char __q06, char __q05, char __q04,
		  char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
		   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
				 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
					  __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
			   __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
		float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
		   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
		   short __q11, short __q10, short __q09, short __q08,
		   short __q07, short __q06, short __q05, short __q04,
		   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
			   __q04, __q05, __q06, __q07,
			   __q08, __q09, __q10, __q11,
			   __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8  (char __q31, char __q30, char __q29, char __q28,
		   char __q27, char __q26, char __q25, char __q24,
		   char __q23, char __q22, char __q21, char __q20,
		   char __q19, char __q18, char __q17, char __q16,
		   char __q15, char __q14, char __q13, char __q12,
		   char __q11, char __q10, char __q09, char __q08,
		   char __q07, char __q06, char __q05, char __q04,
		   char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
			  __q04, __q05, __q06, __q07,
			  __q08, __q09, __q10, __q11,
			  __q12, __q13, __q14, __q15,
			  __q16, __q17, __q18, __q19,
			  __q20, __q21, __q22, __q23,
			  __q24, __q25, __q26, __q27,
			  __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
		    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}

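/* A minimal sketch of the argument order: _mm256_set_* lists the
   highest element first, while _mm256_setr_* lists memory order, so
   the two calls below build the same vector.  Guarded out.  */
#if 0
#include <immintrin.h>

static void
same_vector (void)
{
  __m256d __a = _mm256_set_pd  (3.0, 2.0, 1.0, 0.0);
  __m256d __b = _mm256_setr_pd (0.0, 1.0, 2.0, 3.0);
  (void) __a; (void) __b;  /* both are { 0.0, 1.0, 2.0, 3.0 }.  */
}
#endif
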
/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values; they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256(__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When a cast is done from a 128-bit to a 256-bit type, the low 128 bits
   of the 256-bit result contain the source parameter value, and the
   upper 128 bits of the result are undefined.  These intrinsics
   shouldn't generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}

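/* A minimal sketch of the cast family: bits are reinterpreted, never
   converted, and widening casts leave the upper half undefined, so it
   must be written before it is read.  Guarded out.  */
#if 0
#include <immintrin.h>

static __m256d
widen_then_fix_upper (__m128d __lo)
{
  /* Low half = __lo, upper half undefined...  */
  __m256d __wide = _mm256_castpd128_pd256 (__lo);
  /* ...so give the upper half a defined value explicitly.  */
  return _mm256_insertf128_pd (__wide, _mm_setzero_pd (), 1);
}
#endif
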
#ifdef __DISABLE_AVX__
#undef __DISABLE_AVX__
#pragma GCC pop_options
#endif /* __DISABLE_AVX__ */

#endif /* _AVXINTRIN_H_INCLUDED */