      1 /* Copyright (C) 2011-2014 Free Software Foundation, Inc.
      2 
      3    This file is part of GCC.
      4 
      5    GCC is free software; you can redistribute it and/or modify
      6    it under the terms of the GNU General Public License as published by
      7    the Free Software Foundation; either version 3, or (at your option)
      8    any later version.
      9 
     10    GCC is distributed in the hope that it will be useful,
     11    but WITHOUT ANY WARRANTY; without even the implied warranty of
     12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13    GNU General Public License for more details.
     14 
     15    Under Section 7 of GPL version 3, you are granted additional
     16    permissions described in the GCC Runtime Library Exception, version
     17    3.1, as published by the Free Software Foundation.
     18 
     19    You should have received a copy of the GNU General Public License and
     20    a copy of the GCC Runtime Library Exception along with this program;
     21    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     22    <http://www.gnu.org/licenses/>.  */
     23 
     24 #ifndef _IMMINTRIN_H_INCLUDED
     25 # error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
     26 #endif
     27 
     28 #ifndef _AVX2INTRIN_H_INCLUDED
     29 #define _AVX2INTRIN_H_INCLUDED
     30 
     31 #ifndef __AVX2__
     32 #pragma GCC push_options
     33 #pragma GCC target("avx2")
     34 #define __DISABLE_AVX2__
     35 #endif /* __AVX2__ */
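
         /* Usage note (a sketch, not part of the original header): when this file is
            included in a translation unit built without -mavx2, the pragmas above make
            the definitions below available, but any calling function must itself be
            compiled for AVX2, e.g.

              __attribute__ ((target ("avx2")))
              static __m256i my_add8 (__m256i a, __m256i b)   // hypothetical helper
              {
                return _mm256_add_epi8 (a, b);
              }
         */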
     36 
      37 /* Sum absolute 8-bit integer differences of adjacent groups of 4
      38    byte integers in the first 2 operands.  Starting offsets within
      39    the operands are determined by the 3rd (mask) operand.  */
     40 #ifdef __OPTIMIZE__
     41 extern __inline __m256i
     42 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     43 _mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
     44 {
     45   return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
     46 					      (__v32qi)__Y, __M);
     47 }
     48 #else
     49 #define _mm256_mpsadbw_epu8(X, Y, M)					\
     50   ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X),		\
     51 					(__v32qi)(__m256i)(Y), (int)(M)))
     52 #endif
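
         /* Usage sketch (illustrative only): within each 128-bit lane the instruction
            produces eight 16-bit sums of absolute differences between a fixed 4-byte
            block of __Y, selected by two bits of the mask, and an 8-position sliding
            window over __X whose starting offset is selected by another mask bit.
            For example

              __m256i __sads = _mm256_mpsadbw_epu8 (__X, __Y, 0);

            compares bytes 0..3 of __Y against the windows starting at bytes 0..7 of
            __X in each lane.  */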
     53 
     54 extern __inline __m256i
     55 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     56 _mm256_abs_epi8 (__m256i __A)
     57 {
     58   return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
     59 }
     60 
     61 extern __inline __m256i
     62 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     63 _mm256_abs_epi16 (__m256i __A)
     64 {
     65   return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
     66 }
     67 
     68 extern __inline __m256i
     69 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     70 _mm256_abs_epi32 (__m256i __A)
     71 {
     72   return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
     73 }
     74 
     75 extern __inline __m256i
     76 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     77 _mm256_packs_epi32 (__m256i __A, __m256i __B)
     78 {
     79   return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
     80 }
     81 
     82 extern __inline __m256i
     83 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     84 _mm256_packs_epi16 (__m256i __A, __m256i __B)
     85 {
     86   return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
     87 }
     88 
     89 extern __inline __m256i
     90 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     91 _mm256_packus_epi32 (__m256i __A, __m256i __B)
     92 {
     93   return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
     94 }
     95 
     96 extern __inline __m256i
     97 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     98 _mm256_packus_epi16 (__m256i __A, __m256i __B)
     99 {
    100   return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
    101 }
    102 
    103 extern __inline __m256i
    104 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    105 _mm256_add_epi8 (__m256i __A, __m256i __B)
    106 {
    107   return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B);
    108 }
    109 
    110 extern __inline __m256i
    111 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    112 _mm256_add_epi16 (__m256i __A, __m256i __B)
    113 {
    114   return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B);
    115 }
    116 
    117 extern __inline __m256i
    118 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    119 _mm256_add_epi32 (__m256i __A, __m256i __B)
    120 {
    121   return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B);
    122 }
    123 
    124 extern __inline __m256i
    125 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    126 _mm256_add_epi64 (__m256i __A, __m256i __B)
    127 {
    128   return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B);
    129 }
    130 
    131 extern __inline __m256i
    132 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    133 _mm256_adds_epi8 (__m256i __A, __m256i __B)
    134 {
    135   return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
    136 }
    137 
    138 extern __inline __m256i
    139 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    140 _mm256_adds_epi16 (__m256i __A, __m256i __B)
    141 {
    142   return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
    143 }
    144 
    145 extern __inline __m256i
    146 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    147 _mm256_adds_epu8 (__m256i __A, __m256i __B)
    148 {
    149   return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
    150 }
    151 
    152 extern __inline __m256i
    153 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    154 _mm256_adds_epu16 (__m256i __A, __m256i __B)
    155 {
    156   return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
    157 }
    158 
    159 #ifdef __OPTIMIZE__
    160 extern __inline __m256i
    161 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    162 _mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
    163 {
    164   return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
    165 					      (__v4di)__B,
    166 					      __N * 8);
    167 }
    168 #else
     169 /* Without __OPTIMIZE__, (__N * 8) ends up in a register, not an immediate,
     170    so the palignr insn pattern cannot be matched.  Use a macro instead.  */
    171 #define _mm256_alignr_epi8(A, B, N)				   \
    172   ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A),	   \
    173 					(__v4di)(__m256i)(B),	   \
    174 					(int)(N) * 8))
    175 #endif
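
         /* Usage sketch (illustrative only): within each 128-bit lane the bytes of
            __A:__B are concatenated, shifted right by __N bytes and truncated to 16
            bytes; the count must be a compile-time constant so it can be encoded as
            the palignr immediate, e.g.

              __m256i __r = _mm256_alignr_epi8 (__A, __B, 3);
         */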
    176 
    177 extern __inline __m256i
    178 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    179 _mm256_and_si256 (__m256i __A, __m256i __B)
    180 {
    181   return (__m256i) __builtin_ia32_andsi256 ((__v4di)__A, (__v4di)__B);
    182 }
    183 
    184 extern __inline __m256i
    185 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    186 _mm256_andnot_si256 (__m256i __A, __m256i __B)
    187 {
    188   return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
    189 }
    190 
    191 extern __inline __m256i
    192 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    193 _mm256_avg_epu8 (__m256i __A, __m256i __B)
    194 {
    195   return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
    196 }
    197 
    198 extern __inline __m256i
    199 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    200 _mm256_avg_epu16 (__m256i __A, __m256i __B)
    201 {
    202   return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
    203 }
    204 
    205 extern __inline __m256i
    206 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    207 _mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
    208 {
    209   return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
    210 					       (__v32qi)__Y,
    211 					       (__v32qi)__M);
    212 }
    213 
    214 #ifdef __OPTIMIZE__
    215 extern __inline __m256i
    216 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    217 _mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
    218 {
    219   return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
    220 					      (__v16hi)__Y,
    221 					       __M);
    222 }
    223 #else
    224 #define _mm256_blend_epi16(X, Y, M)					\
    225   ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X),		\
    226 					(__v16hi)(__m256i)(Y), (int)(M)))
    227 #endif
    228 
    229 extern __inline __m256i
    230 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    231 _mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
    232 {
    233   return (__m256i)__builtin_ia32_pcmpeqb256 ((__v32qi)__A, (__v32qi)__B);
    234 }
    235 
    236 extern __inline __m256i
    237 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    238 _mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
    239 {
    240   return (__m256i)__builtin_ia32_pcmpeqw256 ((__v16hi)__A, (__v16hi)__B);
    241 }
    242 
    243 extern __inline __m256i
    244 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    245 _mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
    246 {
    247   return (__m256i)__builtin_ia32_pcmpeqd256 ((__v8si)__A, (__v8si)__B);
    248 }
    249 
    250 extern __inline __m256i
    251 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    252 _mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
    253 {
    254   return (__m256i)__builtin_ia32_pcmpeqq256 ((__v4di)__A, (__v4di)__B);
    255 }
    256 
    257 extern __inline __m256i
    258 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    259 _mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
    260 {
    261   return (__m256i)__builtin_ia32_pcmpgtb256 ((__v32qi)__A,
    262 					     (__v32qi)__B);
    263 }
    264 
    265 extern __inline __m256i
    266 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    267 _mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
    268 {
    269   return (__m256i)__builtin_ia32_pcmpgtw256 ((__v16hi)__A,
    270 					     (__v16hi)__B);
    271 }
    272 
    273 extern __inline __m256i
    274 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    275 _mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
    276 {
    277   return (__m256i)__builtin_ia32_pcmpgtd256 ((__v8si)__A,
    278 					     (__v8si)__B);
    279 }
    280 
    281 extern __inline __m256i
    282 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    283 _mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
    284 {
    285   return (__m256i)__builtin_ia32_pcmpgtq256 ((__v4di)__A, (__v4di)__B);
    286 }
    287 
    288 extern __inline __m256i
    289 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    290 _mm256_hadd_epi16 (__m256i __X, __m256i __Y)
    291 {
    292   return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
    293 					     (__v16hi)__Y);
    294 }
    295 
    296 extern __inline __m256i
    297 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    298 _mm256_hadd_epi32 (__m256i __X, __m256i __Y)
    299 {
    300   return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
    301 }
    302 
    303 extern __inline __m256i
    304 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    305 _mm256_hadds_epi16 (__m256i __X, __m256i __Y)
    306 {
    307   return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
    308 					      (__v16hi)__Y);
    309 }
    310 
    311 extern __inline __m256i
    312 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    313 _mm256_hsub_epi16 (__m256i __X, __m256i __Y)
    314 {
    315   return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
    316 					     (__v16hi)__Y);
    317 }
    318 
    319 extern __inline __m256i
    320 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    321 _mm256_hsub_epi32 (__m256i __X, __m256i __Y)
    322 {
    323   return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
    324 }
    325 
    326 extern __inline __m256i
    327 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    328 _mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
    329 {
    330   return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
    331 					      (__v16hi)__Y);
    332 }
    333 
    334 extern __inline __m256i
    335 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    336 _mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
    337 {
    338   return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
    339 						(__v32qi)__Y);
    340 }
    341 
    342 extern __inline __m256i
    343 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    344 _mm256_madd_epi16 (__m256i __A, __m256i __B)
    345 {
    346   return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
    347 					     (__v16hi)__B);
    348 }
    349 
    350 extern __inline __m256i
    351 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    352 _mm256_max_epi8 (__m256i __A, __m256i __B)
    353 {
    354   return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
    355 }
    356 
    357 extern __inline __m256i
    358 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    359 _mm256_max_epi16 (__m256i __A, __m256i __B)
    360 {
    361   return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
    362 }
    363 
    364 extern __inline __m256i
    365 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    366 _mm256_max_epi32 (__m256i __A, __m256i __B)
    367 {
    368   return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
    369 }
    370 
    371 extern __inline __m256i
    372 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    373 _mm256_max_epu8 (__m256i __A, __m256i __B)
    374 {
    375   return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
    376 }
    377 
    378 extern __inline __m256i
    379 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    380 _mm256_max_epu16 (__m256i __A, __m256i __B)
    381 {
    382   return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
    383 }
    384 
    385 extern __inline __m256i
    386 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    387 _mm256_max_epu32 (__m256i __A, __m256i __B)
    388 {
    389   return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
    390 }
    391 
    392 extern __inline __m256i
    393 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    394 _mm256_min_epi8 (__m256i __A, __m256i __B)
    395 {
    396   return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
    397 }
    398 
    399 extern __inline __m256i
    400 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    401 _mm256_min_epi16 (__m256i __A, __m256i __B)
    402 {
    403   return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
    404 }
    405 
    406 extern __inline __m256i
    407 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    408 _mm256_min_epi32 (__m256i __A, __m256i __B)
    409 {
    410   return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
    411 }
    412 
    413 extern __inline __m256i
    414 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    415 _mm256_min_epu8 (__m256i __A, __m256i __B)
    416 {
    417   return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
    418 }
    419 
    420 extern __inline __m256i
    421 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    422 _mm256_min_epu16 (__m256i __A, __m256i __B)
    423 {
    424   return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
    425 }
    426 
    427 extern __inline __m256i
    428 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    429 _mm256_min_epu32 (__m256i __A, __m256i __B)
    430 {
    431   return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
    432 }
    433 
    434 extern __inline int
    435 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    436 _mm256_movemask_epi8 (__m256i __A)
    437 {
    438   return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
    439 }
    440 
    441 extern __inline __m256i
    442 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    443 _mm256_cvtepi8_epi16 (__m128i __X)
    444 {
    445   return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
    446 }
    447 
    448 extern __inline __m256i
    449 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    450 _mm256_cvtepi8_epi32 (__m128i __X)
    451 {
    452   return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
    453 }
    454 
    455 extern __inline __m256i
    456 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    457 _mm256_cvtepi8_epi64 (__m128i __X)
    458 {
    459   return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
    460 }
    461 
    462 extern __inline __m256i
    463 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    464 _mm256_cvtepi16_epi32 (__m128i __X)
    465 {
    466   return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
    467 }
    468 
    469 extern __inline __m256i
    470 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    471 _mm256_cvtepi16_epi64 (__m128i __X)
    472 {
    473   return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
    474 }
    475 
    476 extern __inline __m256i
    477 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    478 _mm256_cvtepi32_epi64 (__m128i __X)
    479 {
    480   return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
    481 }
    482 
    483 extern __inline __m256i
    484 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    485 _mm256_cvtepu8_epi16 (__m128i __X)
    486 {
    487   return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
    488 }
    489 
    490 extern __inline __m256i
    491 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    492 _mm256_cvtepu8_epi32 (__m128i __X)
    493 {
    494   return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
    495 }
    496 
    497 extern __inline __m256i
    498 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    499 _mm256_cvtepu8_epi64 (__m128i __X)
    500 {
    501   return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
    502 }
    503 
    504 extern __inline __m256i
    505 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    506 _mm256_cvtepu16_epi32 (__m128i __X)
    507 {
    508   return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
    509 }
    510 
    511 extern __inline __m256i
    512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    513 _mm256_cvtepu16_epi64 (__m128i __X)
    514 {
    515   return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
    516 }
    517 
    518 extern __inline __m256i
    519 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    520 _mm256_cvtepu32_epi64 (__m128i __X)
    521 {
    522   return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
    523 }
    524 
    525 extern __inline __m256i
    526 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    527 _mm256_mul_epi32 (__m256i __X, __m256i __Y)
    528 {
    529   return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
    530 }
    531 
    532 extern __inline __m256i
    533 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    534 _mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
    535 {
    536   return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
    537 					       (__v16hi)__Y);
    538 }
    539 
    540 extern __inline __m256i
    541 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    542 _mm256_mulhi_epu16 (__m256i __A, __m256i __B)
    543 {
    544   return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
    545 }
    546 
    547 extern __inline __m256i
    548 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    549 _mm256_mulhi_epi16 (__m256i __A, __m256i __B)
    550 {
    551   return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
    552 }
    553 
    554 extern __inline __m256i
    555 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    556 _mm256_mullo_epi16 (__m256i __A, __m256i __B)
    557 {
    558   return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B);
    559 }
    560 
    561 extern __inline __m256i
    562 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    563 _mm256_mullo_epi32 (__m256i __A, __m256i __B)
    564 {
    565   return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B);
    566 }
    567 
    568 extern __inline __m256i
    569 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    570 _mm256_mul_epu32 (__m256i __A, __m256i __B)
    571 {
    572   return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
    573 }
    574 
    575 extern __inline __m256i
    576 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    577 _mm256_or_si256 (__m256i __A, __m256i __B)
    578 {
    579   return (__m256i)__builtin_ia32_por256 ((__v4di)__A, (__v4di)__B);
    580 }
    581 
    582 extern __inline __m256i
    583 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    584 _mm256_sad_epu8 (__m256i __A, __m256i __B)
    585 {
    586   return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
    587 }
    588 
    589 extern __inline __m256i
    590 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    591 _mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
    592 {
    593   return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
    594 					     (__v32qi)__Y);
    595 }
    596 
    597 #ifdef __OPTIMIZE__
    598 extern __inline __m256i
    599 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    600 _mm256_shuffle_epi32 (__m256i __A, const int __mask)
    601 {
    602   return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
    603 }
    604 
    605 extern __inline __m256i
    606 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    607 _mm256_shufflehi_epi16 (__m256i __A, const int __mask)
    608 {
    609   return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
    610 }
    611 
    612 extern __inline __m256i
    613 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    614 _mm256_shufflelo_epi16 (__m256i __A, const int __mask)
    615 {
    616   return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
    617 }
    618 #else
    619 #define _mm256_shuffle_epi32(A, N) \
    620   ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
    621 #define _mm256_shufflehi_epi16(A, N) \
    622   ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
    623 #define _mm256_shufflelo_epi16(A, N) \
    624   ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
    625 #endif
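
         /* Usage sketch (illustrative only): the shuffle control byte picks one of
            the four 32-bit elements of each 128-bit lane for every result position;
            the _MM_SHUFFLE macro from <xmmintrin.h> builds it, e.g. broadcasting
            element 0 of each lane:

              __m256i __b0 = _mm256_shuffle_epi32 (__A, _MM_SHUFFLE (0, 0, 0, 0));
         */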
    626 
    627 extern __inline __m256i
    628 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    629 _mm256_sign_epi8 (__m256i __X, __m256i __Y)
    630 {
    631   return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
    632 }
    633 
    634 extern __inline __m256i
    635 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    636 _mm256_sign_epi16 (__m256i __X, __m256i __Y)
    637 {
    638   return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
    639 }
    640 
    641 extern __inline __m256i
    642 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    643 _mm256_sign_epi32 (__m256i __X, __m256i __Y)
    644 {
    645   return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
    646 }
    647 
    648 #ifdef __OPTIMIZE__
    649 extern __inline __m256i
    650 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    651 _mm256_slli_si256 (__m256i __A, const int __N)
    652 {
    653   return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
    654 }
    655 #else
    656 #define _mm256_slli_si256(A, N) \
    657   ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
    658 #endif
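
         /* Note: the byte shift operates on each 128-bit lane independently; bytes
            do not cross the lane boundary.  The same holds for _mm256_srli_si256
            below.  */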
    659 
    660 extern __inline __m256i
    661 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    662 _mm256_slli_epi16 (__m256i __A, int __B)
    663 {
    664   return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
    665 }
    666 
    667 extern __inline __m256i
    668 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    669 _mm256_sll_epi16 (__m256i __A, __m128i __B)
    670 {
    671   return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
    672 }
    673 
    674 extern __inline __m256i
    675 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    676 _mm256_slli_epi32 (__m256i __A, int __B)
    677 {
    678   return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
    679 }
    680 
    681 extern __inline __m256i
    682 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    683 _mm256_sll_epi32 (__m256i __A, __m128i __B)
    684 {
    685   return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
    686 }
    687 
    688 extern __inline __m256i
    689 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    690 _mm256_slli_epi64 (__m256i __A, int __B)
    691 {
    692   return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
    693 }
    694 
    695 extern __inline __m256i
    696 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    697 _mm256_sll_epi64 (__m256i __A, __m128i __B)
    698 {
    699   return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
    700 }
    701 
    702 extern __inline __m256i
    703 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    704 _mm256_srai_epi16 (__m256i __A, int __B)
    705 {
    706   return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
    707 }
    708 
    709 extern __inline __m256i
    710 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    711 _mm256_sra_epi16 (__m256i __A, __m128i __B)
    712 {
    713   return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
    714 }
    715 
    716 extern __inline __m256i
    717 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    718 _mm256_srai_epi32 (__m256i __A, int __B)
    719 {
    720   return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
    721 }
    722 
    723 extern __inline __m256i
    724 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    725 _mm256_sra_epi32 (__m256i __A, __m128i __B)
    726 {
    727   return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
    728 }
    729 
    730 #ifdef __OPTIMIZE__
    731 extern __inline __m256i
    732 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    733 _mm256_srli_si256 (__m256i __A, const int __N)
    734 {
    735   return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
    736 }
    737 #else
    738 #define _mm256_srli_si256(A, N) \
    739   ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
    740 #endif
    741 
    742 extern __inline __m256i
    743 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    744 _mm256_srli_epi16 (__m256i __A, int __B)
    745 {
    746   return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
    747 }
    748 
    749 extern __inline __m256i
    750 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    751 _mm256_srl_epi16 (__m256i __A, __m128i __B)
    752 {
    753   return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
    754 }
    755 
    756 extern __inline __m256i
    757 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    758 _mm256_srli_epi32 (__m256i __A, int __B)
    759 {
    760   return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
    761 }
    762 
    763 extern __inline __m256i
    764 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    765 _mm256_srl_epi32 (__m256i __A, __m128i __B)
    766 {
    767   return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
    768 }
    769 
    770 extern __inline __m256i
    771 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    772 _mm256_srli_epi64 (__m256i __A, int __B)
    773 {
    774   return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
    775 }
    776 
    777 extern __inline __m256i
    778 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    779 _mm256_srl_epi64 (__m256i __A, __m128i __B)
    780 {
    781   return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
    782 }
    783 
    784 extern __inline __m256i
    785 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    786 _mm256_sub_epi8 (__m256i __A, __m256i __B)
    787 {
    788   return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B);
    789 }
    790 
    791 extern __inline __m256i
    792 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    793 _mm256_sub_epi16 (__m256i __A, __m256i __B)
    794 {
    795   return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B);
    796 }
    797 
    798 extern __inline __m256i
    799 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    800 _mm256_sub_epi32 (__m256i __A, __m256i __B)
    801 {
    802   return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B);
    803 }
    804 
    805 extern __inline __m256i
    806 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    807 _mm256_sub_epi64 (__m256i __A, __m256i __B)
    808 {
    809   return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B);
    810 }
    811 
    812 extern __inline __m256i
    813 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    814 _mm256_subs_epi8 (__m256i __A, __m256i __B)
    815 {
    816   return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
    817 }
    818 
    819 extern __inline __m256i
    820 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    821 _mm256_subs_epi16 (__m256i __A, __m256i __B)
    822 {
    823   return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
    824 }
    825 
    826 extern __inline __m256i
    827 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    828 _mm256_subs_epu8 (__m256i __A, __m256i __B)
    829 {
    830   return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
    831 }
    832 
    833 extern __inline __m256i
    834 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    835 _mm256_subs_epu16 (__m256i __A, __m256i __B)
    836 {
    837   return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
    838 }
    839 
    840 extern __inline __m256i
    841 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    842 _mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
    843 {
    844   return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
    845 }
    846 
    847 extern __inline __m256i
    848 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    849 _mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
    850 {
    851   return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
    852 }
    853 
    854 extern __inline __m256i
    855 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    856 _mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
    857 {
    858   return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
    859 }
    860 
    861 extern __inline __m256i
    862 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    863 _mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
    864 {
    865   return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
    866 }
    867 
    868 extern __inline __m256i
    869 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    870 _mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
    871 {
    872   return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
    873 }
    874 
    875 extern __inline __m256i
    876 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    877 _mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
    878 {
    879   return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
    880 }
    881 
    882 extern __inline __m256i
    883 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    884 _mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
    885 {
    886   return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
    887 }
    888 
    889 extern __inline __m256i
    890 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    891 _mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
    892 {
    893   return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
    894 }
    895 
    896 extern __inline __m256i
    897 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    898 _mm256_xor_si256 (__m256i __A, __m256i __B)
    899 {
    900   return (__m256i)__builtin_ia32_pxor256 ((__v4di)__A, (__v4di)__B);
    901 }
    902 
    903 extern __inline __m256i
    904 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    905 _mm256_stream_load_si256 (__m256i const *__X)
    906 {
    907   return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
    908 }
    909 
    910 extern __inline __m128
    911 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    912 _mm_broadcastss_ps (__m128 __X)
    913 {
    914   return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
    915 }
    916 
    917 extern __inline __m256
    918 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    919 _mm256_broadcastss_ps (__m128 __X)
    920 {
    921   return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
    922 }
    923 
    924 extern __inline __m256d
    925 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    926 _mm256_broadcastsd_pd (__m128d __X)
    927 {
    928   return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
    929 }
    930 
    931 extern __inline __m256i
    932 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    933 _mm256_broadcastsi128_si256 (__m128i __X)
    934 {
    935   return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
    936 }
    937 
    938 #ifdef __OPTIMIZE__
    939 extern __inline __m128i
    940 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    941 _mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
    942 {
    943   return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
    944 					      (__v4si)__Y,
    945 					      __M);
    946 }
    947 #else
    948 #define _mm_blend_epi32(X, Y, M)					\
    949   ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X),		\
    950 					(__v4si)(__m128i)(Y), (int)(M)))
    951 #endif
    952 
    953 #ifdef __OPTIMIZE__
    954 extern __inline __m256i
    955 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    956 _mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
    957 {
    958   return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
    959 					      (__v8si)__Y,
    960 					      __M);
    961 }
    962 #else
    963 #define _mm256_blend_epi32(X, Y, M)					\
    964   ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X),		\
    965 					(__v8si)(__m256i)(Y), (int)(M)))
    966 #endif
    967 
    968 extern __inline __m256i
    969 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    970 _mm256_broadcastb_epi8 (__m128i __X)
    971 {
    972   return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
    973 }
    974 
    975 extern __inline __m256i
    976 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    977 _mm256_broadcastw_epi16 (__m128i __X)
    978 {
    979   return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
    980 }
    981 
    982 extern __inline __m256i
    983 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    984 _mm256_broadcastd_epi32 (__m128i __X)
    985 {
    986   return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
    987 }
    988 
    989 extern __inline __m256i
    990 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    991 _mm256_broadcastq_epi64 (__m128i __X)
    992 {
    993   return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
    994 }
    995 
    996 extern __inline __m128i
    997 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    998 _mm_broadcastb_epi8 (__m128i __X)
    999 {
   1000   return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
   1001 }
   1002 
   1003 extern __inline __m128i
   1004 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1005 _mm_broadcastw_epi16 (__m128i __X)
   1006 {
   1007   return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
   1008 }
   1009 
   1010 extern __inline __m128i
   1011 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1012 _mm_broadcastd_epi32 (__m128i __X)
   1013 {
   1014   return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
   1015 }
   1016 
   1017 extern __inline __m128i
   1018 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1019 _mm_broadcastq_epi64 (__m128i __X)
   1020 {
   1021   return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
   1022 }
   1023 
   1024 extern __inline __m256i
   1025 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1026 _mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
   1027 {
   1028   return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
   1029 }
   1030 
   1031 #ifdef __OPTIMIZE__
   1032 extern __inline __m256d
   1033 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1034 _mm256_permute4x64_pd (__m256d __X, const int __M)
   1035 {
   1036   return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
   1037 }
   1038 #else
   1039 #define _mm256_permute4x64_pd(X, M)			       \
   1040   ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
   1041 #endif
   1042 
   1043 extern __inline __m256
   1044 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1045 _mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
   1046 {
   1047   return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
   1048 }
   1049 
   1050 #ifdef __OPTIMIZE__
   1051 extern __inline __m256i
   1052 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1053 _mm256_permute4x64_epi64 (__m256i __X, const int __M)
   1054 {
   1055   return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
   1056 }
   1057 #else
   1058 #define _mm256_permute4x64_epi64(X, M)			       \
   1059   ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
   1060 #endif
   1061 
   1062 
   1063 #ifdef __OPTIMIZE__
   1064 extern __inline __m256i
   1065 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1066 _mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
   1067 {
   1068   return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
   1069 }
   1070 #else
   1071 #define _mm256_permute2x128_si256(X, Y, M)				\
   1072   ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
   1073 #endif
   1074 
   1075 #ifdef __OPTIMIZE__
   1076 extern __inline __m128i
   1077 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1078 _mm256_extracti128_si256 (__m256i __X, const int __M)
   1079 {
   1080   return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
   1081 }
   1082 #else
   1083 #define _mm256_extracti128_si256(X, M)				\
   1084   ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
   1085 #endif
   1086 
   1087 #ifdef __OPTIMIZE__
   1088 extern __inline __m256i
   1089 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1090 _mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
   1091 {
   1092   return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
   1093 }
   1094 #else
   1095 #define _mm256_inserti128_si256(X, Y, M)			 \
   1096   ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
   1097 					   (__v2di)(__m128i)(Y), \
   1098 					   (int)(M)))
   1099 #endif
   1100 
   1101 extern __inline __m256i
   1102 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1103 _mm256_maskload_epi32 (int const *__X, __m256i __M )
   1104 {
   1105   return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
   1106 						(__v8si)__M);
   1107 }
   1108 
   1109 extern __inline __m256i
   1110 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1111 _mm256_maskload_epi64 (long long const *__X, __m256i __M )
   1112 {
   1113   return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
   1114 						(__v4di)__M);
   1115 }
   1116 
   1117 extern __inline __m128i
   1118 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1119 _mm_maskload_epi32 (int const *__X, __m128i __M )
   1120 {
   1121   return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
   1122 					     (__v4si)__M);
   1123 }
   1124 
   1125 extern __inline __m128i
   1126 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1127 _mm_maskload_epi64 (long long const *__X, __m128i __M )
   1128 {
   1129   return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
   1130 					     (__v2di)__M);
   1131 }
   1132 
   1133 extern __inline void
   1134 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1135 _mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
   1136 {
   1137   __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
   1138 }
   1139 
   1140 extern __inline void
   1141 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1142 _mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
   1143 {
   1144   __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
   1145 }
   1146 
   1147 extern __inline void
   1148 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1149 _mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
   1150 {
   1151   __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
   1152 }
   1153 
   1154 extern __inline void
   1155 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1156 _mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
   1157 {
   1158   __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
   1159 }
   1160 
   1161 extern __inline __m256i
   1162 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1163 _mm256_sllv_epi32 (__m256i __X, __m256i __Y)
   1164 {
   1165   return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
   1166 }
   1167 
   1168 extern __inline __m128i
   1169 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1170 _mm_sllv_epi32 (__m128i __X, __m128i __Y)
   1171 {
   1172   return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
   1173 }
   1174 
   1175 extern __inline __m256i
   1176 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1177 _mm256_sllv_epi64 (__m256i __X, __m256i __Y)
   1178 {
   1179   return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
   1180 }
   1181 
   1182 extern __inline __m128i
   1183 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1184 _mm_sllv_epi64 (__m128i __X, __m128i __Y)
   1185 {
   1186   return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
   1187 }
   1188 
   1189 extern __inline __m256i
   1190 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1191 _mm256_srav_epi32 (__m256i __X, __m256i __Y)
   1192 {
   1193   return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
   1194 }
   1195 
   1196 extern __inline __m128i
   1197 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1198 _mm_srav_epi32 (__m128i __X, __m128i __Y)
   1199 {
   1200   return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
   1201 }
   1202 
   1203 extern __inline __m256i
   1204 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1205 _mm256_srlv_epi32 (__m256i __X, __m256i __Y)
   1206 {
   1207   return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
   1208 }
   1209 
   1210 extern __inline __m128i
   1211 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1212 _mm_srlv_epi32 (__m128i __X, __m128i __Y)
   1213 {
   1214   return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
   1215 }
   1216 
   1217 extern __inline __m256i
   1218 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1219 _mm256_srlv_epi64 (__m256i __X, __m256i __Y)
   1220 {
   1221   return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
   1222 }
   1223 
   1224 extern __inline __m128i
   1225 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1226 _mm_srlv_epi64 (__m128i __X, __m128i __Y)
   1227 {
   1228   return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
   1229 }
   1230 
   1231 #ifdef __OPTIMIZE__
   1232 extern __inline __m128d
   1233 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1234 _mm_i32gather_pd (double const *base, __m128i index, const int scale)
   1235 {
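           /* Comparing zero with itself yields an all-ones (all-true) mask, so every
              element is loaded from memory and the source operand's contents do not
              matter.  */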
   1236   __v2df zero = _mm_setzero_pd ();
   1237   __v2df mask = _mm_cmpeq_pd (zero, zero);
   1238 
   1239   return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (),
   1240 						base,
   1241 						(__v4si)index,
   1242 						mask,
   1243 						scale);
   1244 }
   1245 
   1246 extern __inline __m128d
   1247 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1248 _mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index,
   1249 		       __m128d mask, const int scale)
   1250 {
   1251   return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src,
   1252 						base,
   1253 						(__v4si)index,
   1254 						(__v2df)mask,
   1255 						scale);
   1256 }
   1257 
   1258 extern __inline __m256d
   1259 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1260 _mm256_i32gather_pd (double const *base, __m128i index, const int scale)
   1261 {
   1262   __v4df zero = _mm256_setzero_pd ();
   1263   __v4df mask = _mm256_cmp_pd (zero, zero, _CMP_EQ_OQ);
   1264 
   1265   return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (),
   1266 						base,
   1267 						(__v4si)index,
   1268 						mask,
   1269 						scale);
   1270 }
   1271 
   1272 extern __inline __m256d
   1273 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1274 _mm256_mask_i32gather_pd (__m256d src, double const *base,
   1275 			  __m128i index, __m256d mask, const int scale)
   1276 {
   1277   return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src,
   1278 						base,
   1279 						(__v4si)index,
   1280 						(__v4df)mask,
   1281 						scale);
   1282 }
   1283 
   1284 extern __inline __m128d
   1285 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1286 _mm_i64gather_pd (double const *base, __m128i index, const int scale)
   1287 {
   1288   __v2df src = _mm_setzero_pd ();
   1289   __v2df mask = _mm_cmpeq_pd (src, src);
   1290 
   1291   return (__m128d) __builtin_ia32_gatherdiv2df (src,
   1292 						base,
   1293 						(__v2di)index,
   1294 						mask,
   1295 						scale);
   1296 }
   1297 
   1298 extern __inline __m128d
   1299 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1300 _mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index,
   1301 		       __m128d mask, const int scale)
   1302 {
   1303   return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src,
   1304 						base,
   1305 						(__v2di)index,
   1306 						(__v2df)mask,
   1307 						scale);
   1308 }
   1309 
   1310 extern __inline __m256d
   1311 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1312 _mm256_i64gather_pd (double const *base, __m256i index, const int scale)
   1313 {
   1314   __v4df src = _mm256_setzero_pd ();
   1315   __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);
   1316 
   1317   return (__m256d) __builtin_ia32_gatherdiv4df (src,
   1318 						base,
   1319 						(__v4di)index,
   1320 						mask,
   1321 						scale);
   1322 }
   1323 
   1324 extern __inline __m256d
   1325 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1326 _mm256_mask_i64gather_pd (__m256d src, double const *base,
   1327 			  __m256i index, __m256d mask, const int scale)
   1328 {
   1329   return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src,
   1330 						base,
   1331 						(__v4di)index,
   1332 						(__v4df)mask,
   1333 						scale);
   1334 }
   1335 
   1336 extern __inline __m128
   1337 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1338 _mm_i32gather_ps (float const *base, __m128i index, const int scale)
   1339 {
   1340   __v4sf src = _mm_setzero_ps ();
   1341   __v4sf mask = _mm_cmpeq_ps (src, src);
   1342 
   1343   return (__m128) __builtin_ia32_gathersiv4sf (src,
   1344 					       base,
   1345 					       (__v4si)index,
   1346 					       mask,
   1347 					       scale);
   1348 }
   1349 
   1350 extern __inline __m128
   1351 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1352 _mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index,
   1353 		       __m128 mask, const int scale)
   1354 {
   1355   return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src,
   1356 					       base,
   1357 					       (__v4si)index,
   1358 					       (__v4sf)mask,
   1359 					       scale);
   1360 }
   1361 
   1362 extern __inline __m256
   1363 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1364 _mm256_i32gather_ps (float const *base, __m256i index, const int scale)
   1365 {
   1366   __v8sf src = _mm256_setzero_ps ();
   1367   __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
   1368 
   1369   return (__m256) __builtin_ia32_gathersiv8sf (src,
   1370 					       base,
   1371 					       (__v8si)index,
   1372 					       mask,
   1373 					       scale);
   1374 }
   1375 
   1376 extern __inline __m256
   1377 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1378 _mm256_mask_i32gather_ps (__m256 src, float const *base,
   1379 			  __m256i index, __m256 mask, const int scale)
   1380 {
   1381   return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src,
   1382 					       base,
   1383 					       (__v8si)index,
   1384 					       (__v8sf)mask,
   1385 					       scale);
   1386 }
   1387 
   1388 extern __inline __m128
   1389 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1390 _mm_i64gather_ps (float const *base, __m128i index, const int scale)
   1391 {
   1392   __v4sf src = _mm_setzero_ps ();
   1393   __v4sf mask = _mm_cmpeq_ps (src, src);
   1394 
   1395   return (__m128) __builtin_ia32_gatherdiv4sf (src,
   1396 					       base,
   1397 					       (__v2di)index,
   1398 					       mask,
   1399 					       scale);
   1400 }
   1401 
   1402 extern __inline __m128
   1403 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1404 _mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index,
   1405 		       __m128 mask, const int scale)
   1406 {
   1407   return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src,
   1408 						base,
   1409 						(__v2di)index,
   1410 						(__v4sf)mask,
   1411 						scale);
   1412 }
   1413 
   1414 extern __inline __m128
   1415 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1416 _mm256_i64gather_ps (float const *base, __m256i index, const int scale)
   1417 {
   1418   __v4sf src = _mm_setzero_ps ();
   1419   __v4sf mask = _mm_cmpeq_ps (src, src);
   1420 
   1421   return (__m128) __builtin_ia32_gatherdiv4sf256 (src,
   1422 						  base,
   1423 						  (__v4di)index,
   1424 						  mask,
   1425 						  scale);
   1426 }
   1427 
   1428 extern __inline __m128
   1429 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1430 _mm256_mask_i64gather_ps (__m128 src, float const *base,
   1431 			  __m256i index, __m128 mask, const int scale)
   1432 {
   1433   return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src,
   1434 						  base,
   1435 						  (__v4di)index,
   1436 						  (__v4sf)mask,
   1437 						  scale);
   1438 }
   1439 
   1440 extern __inline __m128i
   1441 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1442 _mm_i32gather_epi64 (long long int const *base,
   1443 		     __m128i index, const int scale)
   1444 {
   1445   __v2di src = __extension__ (__v2di){ 0, 0 };
   1446   __v2di mask = __extension__ (__v2di){ ~0, ~0 };
   1447 
   1448   return (__m128i) __builtin_ia32_gathersiv2di (src,
   1449 						base,
   1450 						(__v4si)index,
   1451 						mask,
   1452 						scale);
   1453 }
   1454 
   1455 extern __inline __m128i
   1456 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1457 _mm_mask_i32gather_epi64 (__m128i src, long long int const *base,
   1458 			  __m128i index, __m128i mask, const int scale)
   1459 {
   1460   return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src,
   1461 						base,
   1462 						(__v4si)index,
   1463 						(__v2di)mask,
   1464 						scale);
   1465 }
   1466 
   1467 extern __inline __m256i
   1468 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1469 _mm256_i32gather_epi64 (long long int const *base,
   1470 			__m128i index, const int scale)
   1471 {
   1472   __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
   1473   __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
   1474 
   1475   return (__m256i) __builtin_ia32_gathersiv4di (src,
   1476 						base,
   1477 						(__v4si)index,
   1478 						mask,
   1479 						scale);
   1480 }
   1481 
   1482 extern __inline __m256i
   1483 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1484 _mm256_mask_i32gather_epi64 (__m256i src, long long int const *base,
   1485 			     __m128i index, __m256i mask, const int scale)
   1486 {
   1487   return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src,
   1488 						base,
   1489 						(__v4si)index,
   1490 						(__v4di)mask,
   1491 						scale);
   1492 }
   1493 
   1494 extern __inline __m128i
   1495 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1496 _mm_i64gather_epi64 (long long int const *base,
   1497 		     __m128i index, const int scale)
   1498 {
   1499   __v2di src = __extension__ (__v2di){ 0, 0 };
   1500   __v2di mask = __extension__ (__v2di){ ~0, ~0 };
   1501 
   1502   return (__m128i) __builtin_ia32_gatherdiv2di (src,
   1503 						base,
   1504 						(__v2di)index,
   1505 						mask,
   1506 						scale);
   1507 }
   1508 
   1509 extern __inline __m128i
   1510 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1511 _mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index,
   1512 			  __m128i mask, const int scale)
   1513 {
   1514   return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src,
   1515 						base,
   1516 						(__v2di)index,
   1517 						(__v2di)mask,
   1518 						scale);
   1519 }
   1520 
   1521 extern __inline __m256i
   1522 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1523 _mm256_i64gather_epi64 (long long int const *base,
   1524 			__m256i index, const int scale)
   1525 {
   1526   __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
   1527   __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
   1528 
   1529   return (__m256i) __builtin_ia32_gatherdiv4di (src,
   1530 						base,
   1531 						(__v4di)index,
   1532 						mask,
   1533 						scale);
   1534 }
   1535 
   1536 extern __inline __m256i
   1537 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1538 _mm256_mask_i64gather_epi64 (__m256i src, long long int const *base,
   1539 			     __m256i index, __m256i mask, const int scale)
   1540 {
   1541   return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src,
   1542 						base,
   1543 						(__v4di)index,
   1544 						(__v4di)mask,
   1545 						scale);
   1546 }
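
/* Illustrative usage sketch for the 64-bit-element gathers: lane i is read
   from the byte address base + index[i] * scale, and in the masked forms a
   lane whose mask element has its top bit clear is copied from SRC instead
   of being loaded.  "table" and "keep_mask" below are hypothetical:

     __m256i idx = _mm256_setr_epi64x (0, 3, 5, 7);
     __m256i v   = _mm256_i64gather_epi64 (table, idx, 8);
     __m256i w   = _mm256_mask_i64gather_epi64 (v, table, idx, keep_mask, 8);  */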
   1547 
   1548 extern __inline __m128i
   1549 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1550 _mm_i32gather_epi32 (int const *base, __m128i index, const int scale)
   1551 {
   1552   __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
   1553   __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
   1554 
   1555   return (__m128i) __builtin_ia32_gathersiv4si (src,
   1556 					       base,
   1557 					       (__v4si)index,
   1558 					       mask,
   1559 					       scale);
   1560 }
   1561 
   1562 extern __inline __m128i
   1563 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1564 _mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index,
   1565 			  __m128i mask, const int scale)
   1566 {
   1567   return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src,
   1568 						base,
   1569 						(__v4si)index,
   1570 						(__v4si)mask,
   1571 						scale);
   1572 }
   1573 
   1574 extern __inline __m256i
   1575 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1576 _mm256_i32gather_epi32 (int const *base, __m256i index, const int scale)
   1577 {
   1578   __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
   1579   __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };
   1580 
   1581   return (__m256i) __builtin_ia32_gathersiv8si (src,
   1582 						base,
   1583 						(__v8si)index,
   1584 						mask,
   1585 						scale);
   1586 }
   1587 
   1588 extern __inline __m256i
   1589 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1590 _mm256_mask_i32gather_epi32 (__m256i src, int const *base,
   1591 			     __m256i index, __m256i mask, const int scale)
   1592 {
   1593   return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src,
   1594 						base,
   1595 						(__v8si)index,
   1596 						(__v8si)mask,
   1597 						scale);
   1598 }
   1599 
   1600 extern __inline __m128i
   1601 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1602 _mm_i64gather_epi32 (int const *base, __m128i index, const int scale)
   1603 {
   1604   __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
   1605   __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
   1606 
   1607   return (__m128i) __builtin_ia32_gatherdiv4si (src,
   1608 						base,
   1609 						(__v2di)index,
   1610 						mask,
   1611 						scale);
   1612 }
   1613 
   1614 extern __inline __m128i
   1615 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1616 _mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index,
   1617 			  __m128i mask, const int scale)
   1618 {
   1619   return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src,
   1620 						base,
   1621 						(__v2di)index,
   1622 						(__v4si)mask,
   1623 						scale);
   1624 }
   1625 
   1626 extern __inline __m128i
   1627 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1628 _mm256_i64gather_epi32 (int const *base, __m256i index, const int scale)
   1629 {
   1630   __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
   1631   __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
   1632 
   1633   return (__m128i) __builtin_ia32_gatherdiv4si256 (src,
   1634 						  base,
   1635 						  (__v4di)index,
   1636 						  mask,
   1637 						  scale);
   1638 }
   1639 
   1640 extern __inline __m128i
   1641 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1642 _mm256_mask_i64gather_epi32 (__m128i src, int const *base,
   1643 			     __m256i index, __m128i mask, const int scale)
   1644 {
   1645   return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src,
   1646 						   base,
   1647 						   (__v4di)index,
   1648 						   (__v4si)mask,
   1649 						   scale);
   1650 }
   1651 #else /* __OPTIMIZE__ */
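/* Without optimization the always_inline wrappers above cannot rely on
   inlining and constant propagation to turn their const-int scale argument
   into the immediate operand the gather builtins require, so the same
   intrinsics are provided here as macros that keep SCALE a literal in the
   expanded call.  */
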
   1652 #define _mm_i32gather_pd(BASE, INDEX, SCALE)				\
   1653   (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (),	\
   1654 					 (double const *)BASE,		\
   1655 					 (__v4si)(__m128i)INDEX,	\
   1656 					 (__v2df)_mm_set1_pd(		\
   1657 					   (double)(long long int) -1), \
   1658 					 (int)SCALE)
   1659 
   1660 #define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
   1661   (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC,	 \
   1662 					 (double const *)BASE,	 \
   1663 					 (__v4si)(__m128i)INDEX, \
   1664 					 (__v2df)(__m128d)MASK,	 \
   1665 					 (int)SCALE)
   1666 
   1667 #define _mm256_i32gather_pd(BASE, INDEX, SCALE)				\
   1668   (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (),	\
   1669 					 (double const *)BASE,		\
   1670 					 (__v4si)(__m128i)INDEX,	\
   1671 					 (__v4df)_mm256_set1_pd(	\
   1672 					   (double)(long long int) -1), \
   1673 					 (int)SCALE)
   1674 
   1675 #define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
   1676   (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC,	 \
   1677 					 (double const *)BASE,	 \
   1678 					 (__v4si)(__m128i)INDEX, \
   1679 					 (__v4df)(__m256d)MASK,	 \
   1680 					 (int)SCALE)
   1681 
   1682 #define _mm_i64gather_pd(BASE, INDEX, SCALE)				\
   1683   (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (),	\
   1684 					 (double const *)BASE,		\
   1685 					 (__v2di)(__m128i)INDEX,	\
   1686 					 (__v2df)_mm_set1_pd(		\
   1687 					   (double)(long long int) -1), \
   1688 					 (int)SCALE)
   1689 
   1690 #define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
   1691   (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC,	 \
   1692 					 (double const *)BASE,	 \
   1693 					 (__v2di)(__m128i)INDEX, \
   1694 					 (__v2df)(__m128d)MASK,	 \
   1695 					 (int)SCALE)
   1696 
   1697 #define _mm256_i64gather_pd(BASE, INDEX, SCALE)				\
   1698   (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (),	\
   1699 					 (double const *)BASE,		\
   1700 					 (__v4di)(__m256i)INDEX,	\
   1701 					 (__v4df)_mm256_set1_pd(	\
   1702 					   (double)(long long int) -1), \
   1703 					 (int)SCALE)
   1704 
   1705 #define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
   1706   (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC,	 \
   1707 					 (double const *)BASE,	 \
   1708 					 (__v4di)(__m256i)INDEX, \
   1709 					 (__v4df)(__m256d)MASK,	 \
   1710 					 (int)SCALE)
   1711 
   1712 #define _mm_i32gather_ps(BASE, INDEX, SCALE)				\
   1713   (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (),	\
   1714 					(float const *)BASE,		\
   1715 					(__v4si)(__m128i)INDEX,		\
   1716 					_mm_set1_ps ((float)(int) -1),	\
   1717 					(int)SCALE)
   1718 
   1719 #define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)	 \
   1720   (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128)SRC,	 \
   1721 					(float const *)BASE,	 \
   1722 					(__v4si)(__m128i)INDEX,	 \
   1723 					(__v4sf)(__m128)MASK,	 \
   1724 					(int)SCALE)
   1725 
   1726 #define _mm256_i32gather_ps(BASE, INDEX, SCALE)			       \
   1727   (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
   1728 					(float const *)BASE,	       \
   1729 					(__v8si)(__m256i)INDEX,	       \
   1730 					(__v8sf)_mm256_set1_ps (       \
   1731 					  (float)(int) -1),	       \
   1732 					(int)SCALE)
   1733 
   1734 #define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
   1735   (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC,	\
   1736 					(float const *)BASE,	\
   1737 					(__v8si)(__m256i)INDEX, \
   1738 					(__v8sf)(__m256)MASK,	\
   1739 					(int)SCALE)
   1740 
   1741 #define _mm_i64gather_ps(BASE, INDEX, SCALE)				\
   1742   (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_ps (),	\
   1743 					(float const *)BASE,		\
   1744 					(__v2di)(__m128i)INDEX,		\
   1745 					(__v4sf)_mm_set1_ps (		\
   1746 					  (float)(int) -1),		\
   1747 					(int)SCALE)
   1748 
   1749 #define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	 \
   1750   (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC,	 \
   1751 					(float const *)BASE,	 \
   1752 					(__v2di)(__m128i)INDEX,	 \
   1753 					(__v4sf)(__m128)MASK,	 \
   1754 					(int)SCALE)
   1755 
   1756 #define _mm256_i64gather_ps(BASE, INDEX, SCALE)				\
   1757   (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (),	\
   1758 					   (float const *)BASE,		\
   1759 					   (__v4di)(__m256i)INDEX,	\
   1760 					   (__v4sf)_mm_set1_ps(		\
   1761 					     (float)(int) -1),		\
   1762 					   (int)SCALE)
   1763 
   1764 #define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	   \
   1765   (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC,	   \
   1766 					   (float const *)BASE,	   \
   1767 					   (__v4di)(__m256i)INDEX, \
   1768 					   (__v4sf)(__m128)MASK,   \
   1769 					   (int)SCALE)
   1770 
   1771 #define _mm_i32gather_epi64(BASE, INDEX, SCALE)				\
   1772   (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
   1773 					 (long long const *)BASE,	\
   1774 					 (__v4si)(__m128i)INDEX,	\
   1775 					 (__v2di)_mm_set1_epi64x (-1),	\
   1776 					 (int)SCALE)
   1777 
   1778 #define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE)	  \
   1779   (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC,	  \
   1780 					 (long long const *)BASE, \
   1781 					 (__v4si)(__m128i)INDEX,  \
   1782 					 (__v2di)(__m128i)MASK,	  \
   1783 					 (int)SCALE)
   1784 
   1785 #define _mm256_i32gather_epi64(BASE, INDEX, SCALE)			   \
   1786   (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
   1787 					 (long long const *)BASE,	   \
   1788 					 (__v4si)(__m128i)INDEX,	   \
   1789 					 (__v4di)_mm256_set1_epi64x (-1),  \
   1790 					 (int)SCALE)
   1791 
   1792 #define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
   1793   (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC,	   \
   1794 					 (long long const *)BASE,  \
   1795 					 (__v4si)(__m128i)INDEX,   \
   1796 					 (__v4di)(__m256i)MASK,	   \
   1797 					 (int)SCALE)
   1798 
   1799 #define _mm_i64gather_epi64(BASE, INDEX, SCALE)				\
   1800   (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
   1801 					 (long long const *)BASE,	\
   1802 					 (__v2di)(__m128i)INDEX,	\
   1803 					 (__v2di)_mm_set1_epi64x (-1),	\
   1804 					 (int)SCALE)
   1805 
   1806 #define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE)	  \
   1807   (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC,	  \
   1808 					 (long long const *)BASE, \
   1809 					 (__v2di)(__m128i)INDEX,  \
   1810 					 (__v2di)(__m128i)MASK,	  \
   1811 					 (int)SCALE)
   1812 
   1813 #define _mm256_i64gather_epi64(BASE, INDEX, SCALE)			   \
   1814   (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
   1815 					 (long long const *)BASE,	   \
   1816 					 (__v4di)(__m256i)INDEX,	   \
   1817 					 (__v4di)_mm256_set1_epi64x (-1),  \
   1818 					 (int)SCALE)
   1819 
   1820 #define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
   1821   (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC,	   \
   1822 					 (long long const *)BASE,  \
   1823 					 (__v4di)(__m256i)INDEX,   \
   1824 					 (__v4di)(__m256i)MASK,	   \
   1825 					 (int)SCALE)
   1826 
   1827 #define _mm_i32gather_epi32(BASE, INDEX, SCALE)				\
   1828   (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (),	\
   1829 					 (int const *)BASE,		\
   1830 					 (__v4si)(__m128i)INDEX,	\
   1831 					 (__v4si)_mm_set1_epi32 (-1),	\
   1832 					 (int)SCALE)
   1833 
   1834 #define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
   1835   (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC,	\
   1836 					(int const *)BASE,	\
   1837 					(__v4si)(__m128i)INDEX, \
   1838 					(__v4si)(__m128i)MASK,	\
   1839 					(int)SCALE)
   1840 
   1841 #define _mm256_i32gather_epi32(BASE, INDEX, SCALE)			   \
   1842   (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
   1843 					 (int const *)BASE,		   \
   1844 					 (__v8si)(__m256i)INDEX,	   \
   1845 					 (__v8si)_mm256_set1_epi32 (-1),   \
   1846 					 (int)SCALE)
   1847 
   1848 #define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
   1849   (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC,	   \
   1850 					(int const *)BASE,	   \
   1851 					(__v8si)(__m256i)INDEX,	   \
   1852 					(__v8si)(__m256i)MASK,	   \
   1853 					(int)SCALE)
   1854 
   1855 #define _mm_i64gather_epi32(BASE, INDEX, SCALE)				\
   1856   (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (),	\
   1857 					 (int const *)BASE,		\
   1858 					 (__v2di)(__m128i)INDEX,	\
   1859 					 (__v4si)_mm_set1_epi32 (-1),	\
   1860 					 (int)SCALE)
   1861 
   1862 #define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
   1863   (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC,	\
   1864 					(int const *)BASE,	\
   1865 					(__v2di)(__m128i)INDEX, \
   1866 					(__v4si)(__m128i)MASK,	\
   1867 					(int)SCALE)
   1868 
   1869 #define _mm256_i64gather_epi32(BASE, INDEX, SCALE)			   \
   1870   (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
   1871 					    (int const *)BASE,		   \
   1872 					    (__v4di)(__m256i)INDEX,	   \
   1873 					    (__v4si)_mm_set1_epi32(-1),	   \
   1874 					    (int)SCALE)
   1875 
   1876 #define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
   1877   (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC,  \
   1878 					   (int const *)BASE,	   \
   1879 					   (__v4di)(__m256i)INDEX, \
   1880 					   (__v4si)(__m128i)MASK,  \
   1881 					   (int)SCALE)
   1882 #endif  /* __OPTIMIZE__ */
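
/* Illustrative usage sketch for the 32-bit-index float gathers: a scale of
   4 addresses whole floats, so lane i reads table[idx[i]].  "table", "src"
   and "msk" below are hypothetical; in the masked form, lanes whose mask
   sign bit is clear are taken from "src" rather than loaded:

     __m256i idx = _mm256_setr_epi32 (0, 2, 4, 6, 8, 10, 12, 14);
     __m256  v   = _mm256_i32gather_ps (table, idx, 4);
     __m256  w   = _mm256_mask_i32gather_ps (src, table, idx, msk, 4);  */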
   1883 
   1884 #ifdef __DISABLE_AVX2__
   1885 #undef __DISABLE_AVX2__
   1886 #pragma GCC pop_options
   1887 #endif /* __DISABLE_AVX2__ */
   1888 
   1889 #endif /* _AVX2INTRIN_H_INCLUDED */