      1 /* Copyright (C) 2008, 2009 Free Software Foundation, Inc.
      2 
      3    This file is part of GCC.
      4 
      5    GCC is free software; you can redistribute it and/or modify
      6    it under the terms of the GNU General Public License as published by
      7    the Free Software Foundation; either version 3, or (at your option)
      8    any later version.
      9 
     10    GCC is distributed in the hope that it will be useful,
     11    but WITHOUT ANY WARRANTY; without even the implied warranty of
     12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13    GNU General Public License for more details.
     14 
     15    Under Section 7 of GPL version 3, you are granted additional
     16    permissions described in the GCC Runtime Library Exception, version
     17    3.1, as published by the Free Software Foundation.
     18 
     19    You should have received a copy of the GNU General Public License and
     20    a copy of the GCC Runtime Library Exception along with this program;
     21    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     22    <http://www.gnu.org/licenses/>.  */
     23 
     24 /* Implemented from the specification included in the Intel C++ Compiler
     25    User Guide and Reference, version 11.0.  */
     26 
     27 #ifndef _IMMINTRIN_H_INCLUDED
     28 # error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
     29 #endif
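
/* Usage sketch (illustrative, not part of the original header): these
   intrinsics are normally reached through the umbrella header with AVX
   code generation enabled, e.g.

       #include <immintrin.h>
       // build with: gcc -mavx ...

   rather than by including <avxintrin.h> directly.  */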
     30 
     31 /* Internal data types for implementing the intrinsics.  */
     32 typedef double __v4df __attribute__ ((__vector_size__ (32)));
     33 typedef float __v8sf __attribute__ ((__vector_size__ (32)));
     34 typedef long long __v4di __attribute__ ((__vector_size__ (32)));
     35 typedef int __v8si __attribute__ ((__vector_size__ (32)));
     36 typedef short __v16hi __attribute__ ((__vector_size__ (32)));
     37 typedef char __v32qi __attribute__ ((__vector_size__ (32)));
     38 
     39 /* The Intel API is flexible enough that we must allow aliasing with other
     40    vector types, and their scalar components.  */
     41 typedef float __m256 __attribute__ ((__vector_size__ (32),
     42 				     __may_alias__));
     43 typedef long long __m256i __attribute__ ((__vector_size__ (32),
     44 					  __may_alias__));
     45 typedef double __m256d __attribute__ ((__vector_size__ (32),
     46 				       __may_alias__));
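
/* Illustrative sketch (an addition, not from the original header): the
   __may_alias__ attribute lets a vector lvalue legally overlay storage
   declared with scalar types, e.g.

       float __buf[8] __attribute__ ((__aligned__ (32))) = { 0 };
       __m256 __v = *(__m256 *) __buf;   // vector view of float storage

   which is exactly the access pattern _mm256_load_ps below relies on.  */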
     47 
     48 /* Compare predicates for scalar and packed compare intrinsics.  */
     49 
     50 /* Equal (ordered, non-signaling)  */
     51 #define _CMP_EQ_OQ	0x00
     52 /* Less-than (ordered, signaling)  */
     53 #define _CMP_LT_OS	0x01
     54 /* Less-than-or-equal (ordered, signaling)  */
     55 #define _CMP_LE_OS	0x02
     56 /* Unordered (non-signaling)  */
     57 #define _CMP_UNORD_Q	0x03
     58 /* Not-equal (unordered, non-signaling)  */
     59 #define _CMP_NEQ_UQ	0x04
     60 /* Not-less-than (unordered, signaling)  */
     61 #define _CMP_NLT_US	0x05
     62 /* Not-less-than-or-equal (unordered, signaling)  */
     63 #define _CMP_NLE_US	0x06
     64 /* Ordered (non-signaling)  */
     65 #define _CMP_ORD_Q	0x07
     66 /* Equal (unordered, non-signaling)  */
     67 #define _CMP_EQ_UQ	0x08
     68 /* Not-greater-than-or-equal (unordered, signaling)  */
     69 #define _CMP_NGE_US	0x09
     70 /* Not-greater-than (unordered, signaling)  */
     71 #define _CMP_NGT_US	0x0a
     72 /* False (ordered, non-signaling)  */
     73 #define _CMP_FALSE_OQ	0x0b
     74 /* Not-equal (ordered, non-signaling)  */
     75 #define _CMP_NEQ_OQ	0x0c
     76 /* Greater-than-or-equal (ordered, signaling)  */
     77 #define _CMP_GE_OS	0x0d
     78 /* Greater-than (ordered, signaling)  */
     79 #define _CMP_GT_OS	0x0e
     80 /* True (unordered, non-signaling)  */
     81 #define _CMP_TRUE_UQ	0x0f
     82 /* Equal (ordered, signaling)  */
     83 #define _CMP_EQ_OS	0x10
     84 /* Less-than (ordered, non-signaling)  */
     85 #define _CMP_LT_OQ	0x11
     86 /* Less-than-or-equal (ordered, non-signaling)  */
     87 #define _CMP_LE_OQ	0x12
     88 /* Unordered (signaling)  */
     89 #define _CMP_UNORD_S	0x13
     90 /* Not-equal (unordered, signaling)  */
     91 #define _CMP_NEQ_US	0x14
     92 /* Not-less-than (unordered, non-signaling)  */
     93 #define _CMP_NLT_UQ	0x15
     94 /* Not-less-than-or-equal (unordered, non-signaling)  */
     95 #define _CMP_NLE_UQ	0x16
     96 /* Ordered (signaling)  */
     97 #define _CMP_ORD_S	0x17
     98 /* Equal (unordered, signaling)  */
     99 #define _CMP_EQ_US	0x18
    100 /* Not-greater-than-or-equal (unordered, non-signaling)  */
    101 #define _CMP_NGE_UQ	0x19
    102 /* Not-greater-than (unordered, non-signaling)  */
    103 #define _CMP_NGT_UQ	0x1a
    104 /* False (ordered, signaling)  */
    105 #define _CMP_FALSE_OS	0x1b
    106 /* Not-equal (ordered, signaling)  */
    107 #define _CMP_NEQ_OS	0x1c
    108 /* Greater-than-or-equal (ordered, non-signaling)  */
    109 #define _CMP_GE_OQ	0x1d
    110 /* Greater-than (ordered, non-signaling)  */
    111 #define _CMP_GT_OQ	0x1e
    112 /* True (unordered, signaling)  */
    113 #define _CMP_TRUE_US	0x1f
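
/* Usage sketch (illustrative): a predicate above is passed as the third
   operand of the compare intrinsics defined later in this header, e.g.

       __m256d __lt = _mm256_cmp_pd (__a, __b, _CMP_LT_OQ);

   Each 64-bit element of __lt becomes all-ones where the corresponding
   element of __a is less than that of __b (ordered, non-signaling) and
   all-zeros otherwise.  */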
    114 
    115 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    116 _mm256_add_pd (__m256d __A, __m256d __B)
    117 {
    118   return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
    119 }
    120 
    121 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    122 _mm256_add_ps (__m256 __A, __m256 __B)
    123 {
    124   return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
    125 }
    126 
    127 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    128 _mm256_addsub_pd (__m256d __A, __m256d __B)
    129 {
    130   return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
    131 }
    132 
    133 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    134 _mm256_addsub_ps (__m256 __A, __m256 __B)
    135 {
    136   return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
    137 }
    138 
    139 
    140 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    141 _mm256_and_pd (__m256d __A, __m256d __B)
    142 {
    143   return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
    144 }
    145 
    146 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    147 _mm256_and_ps (__m256 __A, __m256 __B)
    148 {
    149   return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
    150 }
    151 
    152 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    153 _mm256_andnot_pd (__m256d __A, __m256d __B)
    154 {
    155   return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
    156 }
    157 
    158 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    159 _mm256_andnot_ps (__m256 __A, __m256 __B)
    160 {
    161   return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
    162 }
    163 
    164 /* Double/single precision floating point blend instructions - select
    165    data from two sources using a constant or variable mask.  */
    166 
    167 #ifdef __OPTIMIZE__
    168 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    169 _mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
    170 {
    171   return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
    172 					      (__v4df)__Y,
    173 					      __M);
    174 }
    175 
    176 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    177 _mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
    178 {
    179   return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
    180 					     (__v8sf)__Y,
    181 					     __M);
    182 }
    183 #else
    184 #define _mm256_blend_pd(X, Y, M)					\
    185   ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X),		\
    186 					(__v4df)(__m256d)(Y), (int)(M)))
    187 
    188 #define _mm256_blend_ps(X, Y, M)					\
    189   ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X),		\
    190 				       (__v8sf)(__m256)(Y), (int)(M)))
    191 #endif
    192 
    193 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    194 _mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
    195 {
    196   return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
    197 					       (__v4df)__Y,
    198 					       (__v4df)__M);
    199 }
    200 
    201 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    202 _mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
    203 {
    204   return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
    205 					      (__v8sf)__Y,
    206 					      (__v8sf)__M);
    207 }
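
/* Usage sketch (illustrative): with the immediate form, bit I of the mask
   selects element I of the second source when set and of the first source
   when clear; the variable form makes the same choice from the sign bit of
   each mask element, e.g.

       __m256d __r1 = _mm256_blend_pd (__x, __y, 0x5);   // y0, x1, y2, x3
       __m256d __r2 = _mm256_blendv_pd (__x, __y, __m);  // sign bits of __m decide
*/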
    208 
    209 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    210 _mm256_div_pd (__m256d __A, __m256d __B)
    211 {
    212   return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
    213 }
    214 
    215 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    216 _mm256_div_ps (__m256 __A, __m256 __B)
    217 {
    218   return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
    219 }
    220 
    221 /* Dot product instructions with mask-defined summing and zeroing of
    222    parts of the result.  */
    223 
    224 #ifdef __OPTIMIZE__
    225 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    226 _mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
    227 {
    228   return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
    229 					  (__v8sf)__Y,
    230 					  __M);
    231 }
    232 #else
    233 #define _mm256_dp_ps(X, Y, M)						\
    234   ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X),		\
    235 				    (__v8sf)(__m256)(Y), (int)(M)))
    236 #endif
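
/* Usage sketch (illustrative): _mm256_dp_ps works on each 128-bit lane
   independently; the high four mask bits choose which elements enter the
   products and the low four bits choose which result elements receive the
   sum (the others are zeroed), e.g.

       __m256 __d = _mm256_dp_ps (__x, __y, 0xff);   // per-lane dot product,
                                                     // broadcast to all elements
*/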
    237 
    238 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    239 _mm256_hadd_pd (__m256d __X, __m256d __Y)
    240 {
    241   return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
    242 }
    243 
    244 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    245 _mm256_hadd_ps (__m256 __X, __m256 __Y)
    246 {
    247   return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
    248 }
    249 
    250 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    251 _mm256_hsub_pd (__m256d __X, __m256d __Y)
    252 {
    253   return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
    254 }
    255 
    256 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    257 _mm256_hsub_ps (__m256 __X, __m256 __Y)
    258 {
    259   return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
    260 }
    261 
    262 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    263 _mm256_max_pd (__m256d __A, __m256d __B)
    264 {
    265   return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
    266 }
    267 
    268 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    269 _mm256_max_ps (__m256 __A, __m256 __B)
    270 {
    271   return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
    272 }
    273 
    274 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    275 _mm256_min_pd (__m256d __A, __m256d __B)
    276 {
    277   return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
    278 }
    279 
    280 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    281 _mm256_min_ps (__m256 __A, __m256 __B)
    282 {
    283   return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
    284 }
    285 
    286 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    287 _mm256_mul_pd (__m256d __A, __m256d __B)
    288 {
    289   return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
    290 }
    291 
    292 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    293 _mm256_mul_ps (__m256 __A, __m256 __B)
    294 {
    295   return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
    296 }
    297 
    298 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    299 _mm256_or_pd (__m256d __A, __m256d __B)
    300 {
    301   return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
    302 }
    303 
    304 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    305 _mm256_or_ps (__m256 __A, __m256 __B)
    306 {
    307   return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
    308 }
    309 
    310 #ifdef __OPTIMIZE__
    311 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    312 _mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
    313 {
    314   return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
    315 					     __mask);
    316 }
    317 
    318 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    319 _mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
    320 {
    321   return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
    322 					    __mask);
    323 }
    324 #else
    325 #define _mm256_shuffle_pd(A, B, N)					\
    326   ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A),		\
    327 				      (__v4df)(__m256d)(B), (int)(N)))
    328 
    329 #define _mm256_shuffle_ps(A, B, N)					\
    330   ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A),		\
    331 				      (__v8sf)(__m256)(B), (int)(N)))
    332 #endif
    333 
    334 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    335 _mm256_sub_pd (__m256d __A, __m256d __B)
    336 {
    337   return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
    338 }
    339 
    340 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    341 _mm256_sub_ps (__m256 __A, __m256 __B)
    342 {
    343   return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
    344 }
    345 
    346 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    347 _mm256_xor_pd (__m256d __A, __m256d __B)
    348 {
    349   return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
    350 }
    351 
    352 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    353 _mm256_xor_ps (__m256 __A, __m256 __B)
    354 {
    355   return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
    356 }
    357 
    358 #ifdef __OPTIMIZE__
    359 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    360 _mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
    361 {
    362   return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
    363 }
    364 
    365 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    366 _mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
    367 {
    368   return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
    369 }
    370 
    371 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    372 _mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
    373 {
    374   return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
    375 					    __P);
    376 }
    377 
    378 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    379 _mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
    380 {
    381   return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
    382 					   __P);
    383 }
    384 
    385 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    386 _mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
    387 {
    388   return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
    389 }
    390 
    391 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    392 _mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
    393 {
    394   return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
    395 }
    396 #else
    397 #define _mm_cmp_pd(X, Y, P)						\
    398   ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X),		\
    399 				   (__v2df)(__m128d)(Y), (int)(P)))
    400 
    401 #define _mm_cmp_ps(X, Y, P)						\
    402   ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X),			\
    403 				  (__v4sf)(__m128)(Y), (int)(P)))
    404 
    405 #define _mm256_cmp_pd(X, Y, P)						\
    406   ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X),		\
    407 				      (__v4df)(__m256d)(Y), (int)(P)))
    408 
    409 #define _mm256_cmp_ps(X, Y, P)						\
    410   ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X),		\
    411 				     (__v8sf)(__m256)(Y), (int)(P)))
    412 
    413 #define _mm_cmp_sd(X, Y, P)						\
    414   ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X),		\
    415 				   (__v2df)(__m128d)(Y), (int)(P)))
    416 
    417 #define _mm_cmp_ss(X, Y, P)						\
    418   ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X),			\
    419 				  (__v4sf)(__m128)(Y), (int)(P)))
    420 #endif
    421 
    422 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    423 _mm256_cvtepi32_pd (__m128i __A)
    424 {
    425   return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
    426 }
    427 
    428 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    429 _mm256_cvtepi32_ps (__m256i __A)
    430 {
    431   return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
    432 }
    433 
    434 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    435 _mm256_cvtpd_ps (__m256d __A)
    436 {
    437   return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
    438 }
    439 
    440 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    441 _mm256_cvtps_epi32 (__m256 __A)
    442 {
    443   return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
    444 }
    445 
    446 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    447 _mm256_cvtps_pd (__m128 __A)
    448 {
    449   return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
    450 }
    451 
    452 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    453 _mm256_cvttpd_epi32 (__m256d __A)
    454 {
    455   return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
    456 }
    457 
    458 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    459 _mm256_cvtpd_epi32 (__m256d __A)
    460 {
    461   return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
    462 }
    463 
    464 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    465 _mm256_cvttps_epi32 (__m256 __A)
    466 {
    467   return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
    468 }
    469 
    470 #ifdef __OPTIMIZE__
    471 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    472 _mm256_extractf128_pd (__m256d __X, const int __N)
    473 {
    474   return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
    475 }
    476 
    477 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    478 _mm256_extractf128_ps (__m256 __X, const int __N)
    479 {
    480   return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
    481 }
    482 
    483 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    484 _mm256_extractf128_si256 (__m256i __X, const int __N)
    485 {
    486   return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
    487 }
    488 
    489 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    490 _mm256_extract_epi32 (__m256i __X, int const __N)
    491 {
    492   __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
    493   return _mm_extract_epi32 (__Y, __N % 4);
    494 }
    495 
    496 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    497 _mm256_extract_epi16 (__m256i __X, int const __N)
    498 {
    499   __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
    500   return _mm_extract_epi16 (__Y, __N % 8);
    501 }
    502 
    503 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    504 _mm256_extract_epi8 (__m256i __X, int const __N)
    505 {
    506   __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
    507   return _mm_extract_epi8 (__Y, __N % 16);
    508 }
    509 
    510 #ifdef __x86_64__
    511 extern __inline long long  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    512 _mm256_extract_epi64 (__m256i __X, const int __N)
    513 {
    514   __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
    515   return _mm_extract_epi64 (__Y, __N % 2);
    516 }
    517 #endif
    518 #else
    519 #define _mm256_extractf128_pd(X, N)					\
    520   ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X),	\
    521 						(int)(N)))
    522 
    523 #define _mm256_extractf128_ps(X, N)					\
    524   ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X),	\
    525 					       (int)(N)))
    526 
    527 #define _mm256_extractf128_si256(X, N)					\
    528   ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X),	\
    529 						(int)(N)))
    530 
    531 #define _mm256_extract_epi32(X, N)					\
    532   (__extension__							\
    533    ({									\
    534       __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
    535       _mm_extract_epi32 (__Y, (N) % 4);					\
    536     }))
    537 
    538 #define _mm256_extract_epi16(X, N)					\
    539   (__extension__							\
    540    ({									\
    541       __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
    542       _mm_extract_epi16 (__Y, (N) % 8);					\
    543     }))
    544 
    545 #define _mm256_extract_epi8(X, N)					\
    546   (__extension__							\
    547    ({									\
    548       __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
    549       _mm_extract_epi8 (__Y, (N) % 16);					\
    550     }))
    551 
    552 #ifdef __x86_64__
    553 #define _mm256_extract_epi64(X, N)					\
    554   (__extension__							\
    555    ({									\
    556       __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
    557       _mm_extract_epi64 (__Y, (N) % 2);					\
    558     }))
    559 #endif
    560 #endif
    561 
    562 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    563 _mm256_zeroall (void)
    564 {
    565   __builtin_ia32_vzeroall ();
    566 }
    567 
    568 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    569 _mm256_zeroupper (void)
    570 {
    571   __builtin_ia32_vzeroupper ();
    572 }
    573 
    574 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    575 _mm_permutevar_pd (__m128d __A, __m128i __C)
    576 {
    577   return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
    578 						(__v2di)__C);
    579 }
    580 
    581 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    582 _mm256_permutevar_pd (__m256d __A, __m256i __C)
    583 {
    584   return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
    585 						   (__v4di)__C);
    586 }
    587 
    588 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    589 _mm_permutevar_ps (__m128 __A, __m128i __C)
    590 {
    591   return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
    592 					       (__v4si)__C);
    593 }
    594 
    595 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    596 _mm256_permutevar_ps (__m256 __A, __m256i __C)
    597 {
    598   return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
    599 						  (__v8si)__C);
    600 }
    601 
    602 #ifdef __OPTIMIZE__
    603 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    604 _mm_permute_pd (__m128d __X, const int __C)
    605 {
    606   return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
    607 }
    608 
    609 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    610 _mm256_permute_pd (__m256d __X, const int __C)
    611 {
    612   return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
    613 }
    614 
    615 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    616 _mm_permute_ps (__m128 __X, const int __C)
    617 {
    618   return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
    619 }
    620 
    621 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    622 _mm256_permute_ps (__m256 __X, const int __C)
    623 {
    624   return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
    625 }
    626 #else
    627 #define _mm_permute_pd(X, C)						\
    628   ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))
    629 
    630 #define _mm256_permute_pd(X, C)						\
    631   ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X),	(int)(C)))
    632 
    633 #define _mm_permute_ps(X, C)						\
    634   ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))
    635 
    636 #define _mm256_permute_ps(X, C)						\
    637   ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
    638 #endif
    639 
    640 #ifdef __OPTIMIZE__
    641 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    642 _mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
    643 {
    644   return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
    645 						    (__v4df)__Y,
    646 						    __C);
    647 }
    648 
    649 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    650 _mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
    651 {
    652   return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
    653 						   (__v8sf)__Y,
    654 						   __C);
    655 }
    656 
    657 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    658 _mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
    659 {
    660   return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
    661 						    (__v8si)__Y,
    662 						    __C);
    663 }
    664 #else
    665 #define _mm256_permute2f128_pd(X, Y, C)					\
    666   ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X),	\
    667 					      (__v4df)(__m256d)(Y),	\
    668 					      (int)(C)))
    669 
    670 #define _mm256_permute2f128_ps(X, Y, C)					\
    671   ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X),	\
    672 					     (__v8sf)(__m256)(Y),	\
    673 					     (int)(C)))
    674 
    675 #define _mm256_permute2f128_si256(X, Y, C)				\
    676   ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),	\
    677 					      (__v8si)(__m256i)(Y),	\
    678 					      (int)(C)))
    679 #endif
    680 
    681 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    682 _mm_broadcast_ss (float const *__X)
    683 {
    684   return (__m128) __builtin_ia32_vbroadcastss (__X);
    685 }
    686 
    687 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    688 _mm256_broadcast_sd (double const *__X)
    689 {
    690   return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
    691 }
    692 
    693 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    694 _mm256_broadcast_ss (float const *__X)
    695 {
    696   return (__m256) __builtin_ia32_vbroadcastss256 (__X);
    697 }
    698 
    699 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    700 _mm256_broadcast_pd (__m128d const *__X)
    701 {
    702   return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
    703 }
    704 
    705 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    706 _mm256_broadcast_ps (__m128 const *__X)
    707 {
    708   return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
    709 }
    710 
    711 #ifdef __OPTIMIZE__
    712 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    713 _mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
    714 {
    715   return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
    716 						     (__v2df)__Y,
    717 						     __O);
    718 }
    719 
    720 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    721 _mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
    722 {
    723   return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
    724 						    (__v4sf)__Y,
    725 						    __O);
    726 }
    727 
    728 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    729 _mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
    730 {
    731   return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
    732 						     (__v4si)__Y,
    733 						     __O);
    734 }
    735 
    736 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    737 _mm256_insert_epi32 (__m256i __X, int __D, int const __N)
    738 {
    739   __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
    740   __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
    741   return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
    742 }
    743 
    744 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    745 _mm256_insert_epi16 (__m256i __X, int __D, int const __N)
    746 {
    747   __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
    748   __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
    749   return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
    750 }
    751 
    752 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    753 _mm256_insert_epi8 (__m256i __X, int __D, int const __N)
    754 {
    755   __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
    756   __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
    757   return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
    758 }
    759 
    760 #ifdef __x86_64__
    761 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    762 _mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
    763 {
    764   __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
    765   __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
    766   return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
    767 }
    768 #endif
    769 #else
    770 #define _mm256_insertf128_pd(X, Y, O)					\
    771   ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X),	\
    772 					       (__v2df)(__m128d)(Y),	\
    773 					       (int)(O)))
    774 
    775 #define _mm256_insertf128_ps(X, Y, O)					\
    776   ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X),	\
    777 					      (__v4sf)(__m128)(Y),  	\
    778 					      (int)(O)))
    779 
    780 #define _mm256_insertf128_si256(X, Y, O)				\
    781   ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X),	\
    782 					       (__v4si)(__m128i)(Y),	\
    783 					       (int)(O)))
    784 
    785 #define _mm256_insert_epi32(X, D, N)					\
    786   (__extension__							\
    787    ({									\
    788       __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
    789       __Y = _mm_insert_epi32 (__Y, (D), (N) % 4);			\
    790       _mm256_insertf128_si256 ((X), __Y, (N) >> 2);			\
    791     }))
    792 
    793 #define _mm256_insert_epi16(X, D, N)					\
    794   (__extension__							\
    795    ({									\
    796       __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
    797       __Y = _mm_insert_epi16 (__Y, (D), (N) % 8);			\
    798       _mm256_insertf128_si256 ((X), __Y, (N) >> 3);			\
    799     }))
    800 
    801 #define _mm256_insert_epi8(X, D, N)					\
    802   (__extension__							\
    803    ({									\
    804       __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
    805       __Y = _mm_insert_epi8 (__Y, (D), (N) % 16);			\
    806       _mm256_insertf128_si256 ((X), __Y, (N) >> 4);			\
    807     }))
    808 
    809 #ifdef __x86_64__
    810 #define _mm256_insert_epi64(X, D, N)					\
    811   (__extension__							\
    812    ({									\
    813       __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
    814       __Y = _mm_insert_epi64 (__Y, (D), (N) % 2);			\
    815       _mm256_insertf128_si256 ((X), __Y, (N) >> 1);			\
    816     }))
    817 #endif
    818 #endif
    819 
    820 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    821 _mm256_load_pd (double const *__P)
    822 {
    823   return *(__m256d *)__P;
    824 }
    825 
    826 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    827 _mm256_store_pd (double *__P, __m256d __A)
    828 {
    829   *(__m256d *)__P = __A;
    830 }
    831 
    832 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    833 _mm256_load_ps (float const *__P)
    834 {
    835   return *(__m256 *)__P;
    836 }
    837 
    838 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    839 _mm256_store_ps (float *__P, __m256 __A)
    840 {
    841   *(__m256 *)__P = __A;
    842 }
    843 
    844 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    845 _mm256_loadu_pd (double const *__P)
    846 {
    847   return (__m256d) __builtin_ia32_loadupd256 (__P);
    848 }
    849 
    850 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    851 _mm256_storeu_pd (double *__P, __m256d __A)
    852 {
    853   __builtin_ia32_storeupd256 (__P, (__v4df)__A);
    854 }
    855 
    856 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    857 _mm256_loadu_ps (float const *__P)
    858 {
    859   return (__m256) __builtin_ia32_loadups256 (__P);
    860 }
    861 
    862 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    863 _mm256_storeu_ps (float *__P, __m256 __A)
    864 {
    865   __builtin_ia32_storeups256 (__P, (__v8sf)__A);
    866 }
    867 
    868 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    869 _mm256_load_si256 (__m256i const *__P)
    870 {
    871   return *__P;
    872 }
    873 
    874 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    875 _mm256_store_si256 (__m256i *__P, __m256i __A)
    876 {
    877   *__P = __A;
    878 }
    879 
    880 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    881 _mm256_loadu_si256 (__m256i const *__P)
    882 {
    883   return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
    884 }
    885 
    886 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    887 _mm256_storeu_si256 (__m256i *__P, __m256i __A)
    888 {
    889   __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
    890 }
    891 
    892 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    893 _mm_maskload_pd (double const *__P, __m128i __M)
    894 {
    895   return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
    896 					      (__v2di)__M);
    897 }
    898 
    899 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    900 _mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
    901 {
    902   __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
    903 }
    904 
    905 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    906 _mm256_maskload_pd (double const *__P, __m256i __M)
    907 {
    908   return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
    909 						 (__v4di)__M);
    910 }
    911 
    912 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    913 _mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
    914 {
    915   __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
    916 }
    917 
    918 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    919 _mm_maskload_ps (float const *__P, __m128i __M)
    920 {
    921   return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
    922 					     (__v4si)__M);
    923 }
    924 
    925 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    926 _mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
    927 {
    928   __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
    929 }
    930 
    931 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    932 _mm256_maskload_ps (float const *__P, __m256i __M)
    933 {
    934   return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
    935 						(__v8si)__M);
    936 }
    937 
    938 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    939 _mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
    940 {
    941   __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
    942 }
    943 
    944 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    945 _mm256_movehdup_ps (__m256 __X)
    946 {
    947   return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
    948 }
    949 
    950 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    951 _mm256_moveldup_ps (__m256 __X)
    952 {
    953   return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
    954 }
    955 
    956 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    957 _mm256_movedup_pd (__m256d __X)
    958 {
    959   return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
    960 }
    961 
    962 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    963 _mm256_lddqu_si256 (__m256i const *__P)
    964 {
    965   return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
    966 }
    967 
    968 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    969 _mm256_stream_si256 (__m256i *__A, __m256i __B)
    970 {
    971   __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
    972 }
    973 
    974 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    975 _mm256_stream_pd (double *__A, __m256d __B)
    976 {
    977   __builtin_ia32_movntpd256 (__A, (__v4df)__B);
    978 }
    979 
    980 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    981 _mm256_stream_ps (float *__P, __m256 __A)
    982 {
    983   __builtin_ia32_movntps256 (__P, (__v8sf)__A);
    984 }
    985 
    986 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    987 _mm256_rcp_ps (__m256 __A)
    988 {
    989   return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
    990 }
    991 
    992 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    993 _mm256_rsqrt_ps (__m256 __A)
    994 {
    995   return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
    996 }
    997 
    998 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    999 _mm256_sqrt_pd (__m256d __A)
   1000 {
   1001   return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
   1002 }
   1003 
   1004 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1005 _mm256_sqrt_ps (__m256 __A)
   1006 {
   1007   return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
   1008 }
   1009 
   1010 #ifdef __OPTIMIZE__
   1011 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1012 _mm256_round_pd (__m256d __V, const int __M)
   1013 {
   1014   return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
   1015 }
   1016 
   1017 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1018 _mm256_round_ps (__m256 __V, const int __M)
   1019 {
   1020   return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
   1021 }
   1022 #else
   1023 #define _mm256_round_pd(V, M) \
   1024   ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))
   1025 
   1026 #define _mm256_round_ps(V, M) \
   1027   ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
   1028 #endif
   1029 
   1030 #define _mm256_ceil_pd(V)	_mm256_round_pd ((V), _MM_FROUND_CEIL)
   1031 #define _mm256_floor_pd(V)	_mm256_round_pd ((V), _MM_FROUND_FLOOR)
   1032 #define _mm256_ceil_ps(V)	_mm256_round_ps ((V), _MM_FROUND_CEIL)
   1033 #define _mm256_floor_ps(V)	_mm256_round_ps ((V), _MM_FROUND_FLOOR)
   1034 
   1035 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1036 _mm256_unpackhi_pd (__m256d __A, __m256d __B)
   1037 {
   1038   return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
   1039 }
   1040 
   1041 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1042 _mm256_unpacklo_pd (__m256d __A, __m256d __B)
   1043 {
   1044   return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
   1045 }
   1046 
   1047 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1048 _mm256_unpackhi_ps (__m256 __A, __m256 __B)
   1049 {
   1050   return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
   1051 }
   1052 
   1053 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1054 _mm256_unpacklo_ps (__m256 __A, __m256 __B)
   1055 {
   1056   return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
   1057 }
   1058 
   1059 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1060 _mm_testz_pd (__m128d __M, __m128d __V)
   1061 {
   1062   return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
   1063 }
   1064 
   1065 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1066 _mm_testc_pd (__m128d __M, __m128d __V)
   1067 {
   1068   return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
   1069 }
   1070 
   1071 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1072 _mm_testnzc_pd (__m128d __M, __m128d __V)
   1073 {
   1074   return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
   1075 }
   1076 
   1077 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1078 _mm_testz_ps (__m128 __M, __m128 __V)
   1079 {
   1080   return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
   1081 }
   1082 
   1083 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1084 _mm_testc_ps (__m128 __M, __m128 __V)
   1085 {
   1086   return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
   1087 }
   1088 
   1089 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1090 _mm_testnzc_ps (__m128 __M, __m128 __V)
   1091 {
   1092   return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
   1093 }
   1094 
   1095 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1096 _mm256_testz_pd (__m256d __M, __m256d __V)
   1097 {
   1098   return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
   1099 }
   1100 
   1101 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1102 _mm256_testc_pd (__m256d __M, __m256d __V)
   1103 {
   1104   return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
   1105 }
   1106 
   1107 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1108 _mm256_testnzc_pd (__m256d __M, __m256d __V)
   1109 {
   1110   return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
   1111 }
   1112 
   1113 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1114 _mm256_testz_ps (__m256 __M, __m256 __V)
   1115 {
   1116   return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
   1117 }
   1118 
   1119 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1120 _mm256_testc_ps (__m256 __M, __m256 __V)
   1121 {
   1122   return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
   1123 }
   1124 
   1125 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1126 _mm256_testnzc_ps (__m256 __M, __m256 __V)
   1127 {
   1128   return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
   1129 }
   1130 
   1131 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1132 _mm256_testz_si256 (__m256i __M, __m256i __V)
   1133 {
   1134   return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
   1135 }
   1136 
   1137 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1138 _mm256_testc_si256 (__m256i __M, __m256i __V)
   1139 {
   1140   return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
   1141 }
   1142 
   1143 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1144 _mm256_testnzc_si256 (__m256i __M, __m256i __V)
   1145 {
   1146   return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
   1147 }
   1148 
   1149 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1150 _mm256_movemask_pd (__m256d __A)
   1151 {
   1152   return __builtin_ia32_movmskpd256 ((__v4df)__A);
   1153 }
   1154 
   1155 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1156 _mm256_movemask_ps (__m256 __A)
   1157 {
   1158   return __builtin_ia32_movmskps256 ((__v8sf)__A);
   1159 }
   1160 
   1161 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1162 _mm256_setzero_pd (void)
   1163 {
   1164   return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
   1165 }
   1166 
   1167 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1168 _mm256_setzero_ps (void)
   1169 {
   1170   return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
   1171 				 0.0, 0.0, 0.0, 0.0 };
   1172 }
   1173 
   1174 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1175 _mm256_setzero_si256 (void)
   1176 {
   1177   return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
   1178 }
   1179 
   1180 /* Create the vector [A B C D].  */
   1181 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1182 _mm256_set_pd (double __A, double __B, double __C, double __D)
   1183 {
   1184   return __extension__ (__m256d){ __D, __C, __B, __A };
   1185 }
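
/* Illustrative note (not from the original header): arguments run from the
   highest element down to the lowest, so

       __m256d __v = _mm256_set_pd (3.0, 2.0, 1.0, 0.0);
       double __mem[4];
       _mm256_storeu_pd (__mem, __v);   // __mem = { 0.0, 1.0, 2.0, 3.0 }
*/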
   1186 
   1187 /* Create the vector [A B C D E F G H].  */
   1188 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1189 _mm256_set_ps (float __A, float __B, float __C, float __D,
   1190 	       float __E, float __F, float __G, float __H)
   1191 {
   1192   return __extension__ (__m256){ __H, __G, __F, __E,
   1193 				 __D, __C, __B, __A };
   1194 }
   1195 
   1196 /* Create the vector [A B C D E F G H].  */
   1197 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1198 _mm256_set_epi32 (int __A, int __B, int __C, int __D,
   1199 		  int __E, int __F, int __G, int __H)
   1200 {
   1201   return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
   1202 					  __D, __C, __B, __A };
   1203 }
   1204 
   1205 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1206 _mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
   1207 		  short __q11, short __q10, short __q09, short __q08,
   1208 		  short __q07, short __q06, short __q05, short __q04,
   1209 		  short __q03, short __q02, short __q01, short __q00)
   1210 {
   1211   return __extension__ (__m256i)(__v16hi){
   1212     __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
   1213     __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
   1214   };
   1215 }
   1216 
   1217 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1218 _mm256_set_epi8  (char __q31, char __q30, char __q29, char __q28,
   1219 		  char __q27, char __q26, char __q25, char __q24,
   1220 		  char __q23, char __q22, char __q21, char __q20,
   1221 		  char __q19, char __q18, char __q17, char __q16,
   1222 		  char __q15, char __q14, char __q13, char __q12,
   1223 		  char __q11, char __q10, char __q09, char __q08,
   1224 		  char __q07, char __q06, char __q05, char __q04,
   1225 		  char __q03, char __q02, char __q01, char __q00)
   1226 {
   1227   return __extension__ (__m256i)(__v32qi){
   1228     __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
   1229     __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
   1230     __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
   1231     __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
   1232   };
   1233 }
   1234 
   1235 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1236 _mm256_set_epi64x (long long __A, long long __B, long long __C,
   1237 		   long long __D)
   1238 {
   1239   return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
   1240 }
   1241 
   1242 /* Create a vector with all elements equal to A.  */
   1243 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1244 _mm256_set1_pd (double __A)
   1245 {
   1246   return __extension__ (__m256d){ __A, __A, __A, __A };
   1247 }
   1248 
   1249 /* Create a vector with all elements equal to A.  */
   1250 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1251 _mm256_set1_ps (float __A)
   1252 {
   1253   return __extension__ (__m256){ __A, __A, __A, __A,
   1254 				 __A, __A, __A, __A };
   1255 }
   1256 
   1257 /* Create a vector with all elements equal to A.  */
   1258 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1259 _mm256_set1_epi32 (int __A)
   1260 {
   1261   return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
   1262 					  __A, __A, __A, __A };
   1263 }
   1264 
   1265 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1266 _mm256_set1_epi16 (short __A)
   1267 {
   1268   return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
   1269 			   __A, __A, __A, __A, __A, __A, __A, __A);
   1270 }
   1271 
   1272 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1273 _mm256_set1_epi8 (char __A)
   1274 {
   1275   return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
   1276 			  __A, __A, __A, __A, __A, __A, __A, __A,
   1277 			  __A, __A, __A, __A, __A, __A, __A, __A,
   1278 			  __A, __A, __A, __A, __A, __A, __A, __A);
   1279 }
   1280 
   1281 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1282 _mm256_set1_epi64x (long long __A)
   1283 {
   1284   return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
   1285 }
   1286 
   1287 /* Create vectors of elements in the reversed order from the
   1288    _mm256_set_XXX functions.  */
   1289 
   1290 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1291 _mm256_setr_pd (double __A, double __B, double __C, double __D)
   1292 {
   1293   return _mm256_set_pd (__D, __C, __B, __A);
   1294 }
   1295 
   1296 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1297 _mm256_setr_ps (float __A, float __B, float __C, float __D,
   1298 		float __E, float __F, float __G, float __H)
   1299 {
   1300   return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
   1301 }
   1302 
   1303 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1304 _mm256_setr_epi32 (int __A, int __B, int __C, int __D,
   1305 		   int __E, int __F, int __G, int __H)
   1306 {
   1307   return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
   1308 }
   1309 
   1310 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1311 _mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
   1312 		   short __q11, short __q10, short __q09, short __q08,
   1313 		   short __q07, short __q06, short __q05, short __q04,
   1314 		   short __q03, short __q02, short __q01, short __q00)
   1315 {
   1316   return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
   1317 			   __q04, __q05, __q06, __q07,
   1318 			   __q08, __q09, __q10, __q11,
   1319 			   __q12, __q13, __q14, __q15);
   1320 }
   1321 
   1322 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1323 _mm256_setr_epi8  (char __q31, char __q30, char __q29, char __q28,
   1324 		   char __q27, char __q26, char __q25, char __q24,
   1325 		   char __q23, char __q22, char __q21, char __q20,
   1326 		   char __q19, char __q18, char __q17, char __q16,
   1327 		   char __q15, char __q14, char __q13, char __q12,
   1328 		   char __q11, char __q10, char __q09, char __q08,
   1329 		   char __q07, char __q06, char __q05, char __q04,
   1330 		   char __q03, char __q02, char __q01, char __q00)
   1331 {
   1332   return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
   1333 			  __q04, __q05, __q06, __q07,
   1334 			  __q08, __q09, __q10, __q11,
   1335 			  __q12, __q13, __q14, __q15,
   1336 			  __q16, __q17, __q18, __q19,
   1337 			  __q20, __q21, __q22, __q23,
   1338 			  __q24, __q25, __q26, __q27,
   1339 			  __q28, __q29, __q30, __q31);
   1340 }
   1341 
   1342 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1343 _mm256_setr_epi64x (long long __A, long long __B, long long __C,
   1344 		    long long __D)
   1345 {
   1346   return _mm256_set_epi64x (__D, __C, __B, __A);
   1347 }
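
/* Illustrative note: the _mm256_setr_XXX forms take arguments from the
   lowest element up, so the two calls below build the same vector:

       __m256d __a = _mm256_setr_pd (0.0, 1.0, 2.0, 3.0);
       __m256d __b = _mm256_set_pd  (3.0, 2.0, 1.0, 0.0);
*/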
   1348 
   1349 /* Casts between various SP, DP, INT vector types.  Note that these do no
   1350    conversion of values; they just change the type.  */
   1351 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1352 _mm256_castpd_ps (__m256d __A)
   1353 {
   1354   return (__m256) __A;
   1355 }
   1356 
   1357 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1358 _mm256_castpd_si256 (__m256d __A)
   1359 {
   1360   return (__m256i) __A;
   1361 }
   1362 
   1363 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1364 _mm256_castps_pd (__m256 __A)
   1365 {
   1366   return (__m256d) __A;
   1367 }
   1368 
   1369 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1370 _mm256_castps_si256(__m256 __A)
   1371 {
   1372   return (__m256i) __A;
   1373 }
   1374 
   1375 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1376 _mm256_castsi256_ps (__m256i __A)
   1377 {
   1378   return (__m256) __A;
   1379 }
   1380 
   1381 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1382 _mm256_castsi256_pd (__m256i __A)
   1383 {
   1384   return (__m256d) __A;
   1385 }
   1386 
   1387 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1388 _mm256_castpd256_pd128 (__m256d __A)
   1389 {
   1390   return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
   1391 }
   1392 
   1393 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1394 _mm256_castps256_ps128 (__m256 __A)
   1395 {
   1396   return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
   1397 }
   1398 
   1399 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1400 _mm256_castsi256_si128 (__m256i __A)
   1401 {
   1402   return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
   1403 }
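
/* Illustrative note: the same-width casts above only relabel the bits,

       __m256d __d = _mm256_set1_pd (1.0);
       __m256i __i = _mm256_castpd_si256 (__d);   // identical 256 bits, integer-typed

   while the 256-to-128 casts simply return the low 128 bits of their
   argument.  */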
   1404 
   1405 /* When a cast is done from a 128-bit to a 256-bit type, the low 128 bits
   1406    of the 256-bit result contain the source parameter value and the upper
   1407    128 bits of the result are undefined.  These intrinsics shouldn't
   1408    generate any extra moves.  */
   1409 
   1410 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1411 _mm256_castpd128_pd256 (__m128d __A)
   1412 {
   1413   return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
   1414 }
   1415 
   1416 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1417 _mm256_castps128_ps256 (__m128 __A)
   1418 {
   1419   return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
   1420 }
   1421 
   1422 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1423 _mm256_castsi128_si256 (__m128i __A)
   1424 {
   1425   return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
   1426 }
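
/* Illustrative note (not from the original header): since the upper 128
   bits are undefined after these casts, code that needs a fully defined
   256-bit value can build it explicitly, e.g.

       __m256 __wide = _mm256_insertf128_ps (_mm256_castps128_ps256 (__lo),
                                             __hi, 1);

   which puts __lo in the low lane and __hi in the high lane.  */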
   1427