Home | History | Annotate | Download | only in include
      1 /* Copyright (C) 2011
      2    Free Software Foundation, Inc.
      3 
      4    This file is part of GCC.
      5 
      6    GCC is free software; you can redistribute it and/or modify
      7    it under the terms of the GNU General Public License as published by
      8    the Free Software Foundation; either version 3, or (at your option)
      9    any later version.
     10 
     11    GCC is distributed in the hope that it will be useful,
     12    but WITHOUT ANY WARRANTY; without even the implied warranty of
     13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14    GNU General Public License for more details.
     15 
     16    Under Section 7 of GPL version 3, you are granted additional
     17    permissions described in the GCC Runtime Library Exception, version
     18    3.1, as published by the Free Software Foundation.
     19 
     20    You should have received a copy of the GNU General Public License and
     21    a copy of the GCC Runtime Library Exception along with this program;
     22    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     23    <http://www.gnu.org/licenses/>.  */
     24 
     25 #ifndef _IMMINTRIN_H_INCLUDED
     26 # error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
     27 #endif
     28 
     29 /* Sum absolute 8-bit integer difference of adjacent groups of 4
     30    byte integers in the first 2 operands.  Starting offsets within
     31    operands are determined by the 3rd mask operand.  */
     32 #ifdef __OPTIMIZE__
     33 extern __inline __m256i
     34 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     35 _mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
     36 {
     37   return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
     38 					      (__v32qi)__Y, __M);
     39 }
     40 #else
     41 #define _mm256_mpsadbw_epu8(X, Y, M)					\
     42   ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X),		\
     43 					(__v32qi)(__m256i)(Y), (int)(M)))
     44 #endif
     45 
     46 extern __inline __m256i
     47 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     48 _mm256_abs_epi8 (__m256i __A)
     49 {
     50   return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
     51 }
     52 
     53 extern __inline __m256i
     54 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     55 _mm256_abs_epi16 (__m256i __A)
     56 {
     57   return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
     58 }
     59 
     60 extern __inline __m256i
     61 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     62 _mm256_abs_epi32 (__m256i __A)
     63 {
     64   return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
     65 }
     66 
     67 extern __inline __m256i
     68 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     69 _mm256_packs_epi32 (__m256i __A, __m256i __B)
     70 {
     71   return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
     72 }
     73 
     74 extern __inline __m256i
     75 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     76 _mm256_packs_epi16 (__m256i __A, __m256i __B)
     77 {
     78   return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
     79 }
     80 
     81 extern __inline __m256i
     82 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     83 _mm256_packus_epi32 (__m256i __A, __m256i __B)
     84 {
     85   return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
     86 }
     87 
     88 extern __inline __m256i
     89 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     90 _mm256_packus_epi16 (__m256i __A, __m256i __B)
     91 {
     92   return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
     93 }
     94 
     95 extern __inline __m256i
     96 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     97 _mm256_add_epi8 (__m256i __A, __m256i __B)
     98 {
     99   return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B);
    100 }
    101 
    102 extern __inline __m256i
    103 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    104 _mm256_add_epi16 (__m256i __A, __m256i __B)
    105 {
    106   return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B);
    107 }
    108 
    109 extern __inline __m256i
    110 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    111 _mm256_add_epi32 (__m256i __A, __m256i __B)
    112 {
    113   return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B);
    114 }
    115 
    116 extern __inline __m256i
    117 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    118 _mm256_add_epi64 (__m256i __A, __m256i __B)
    119 {
    120   return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B);
    121 }
    122 
    123 extern __inline __m256i
    124 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    125 _mm256_adds_epi8 (__m256i __A, __m256i __B)
    126 {
    127   return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
    128 }
    129 
    130 extern __inline __m256i
    131 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    132 _mm256_adds_epi16 (__m256i __A, __m256i __B)
    133 {
    134   return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
    135 }
    136 
    137 extern __inline __m256i
    138 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    139 _mm256_adds_epu8 (__m256i __A, __m256i __B)
    140 {
    141   return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
    142 }
    143 
    144 extern __inline __m256i
    145 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    146 _mm256_adds_epu16 (__m256i __A, __m256i __B)
    147 {
    148   return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
    149 }
    150 
    151 #ifdef __OPTIMIZE__
    152 extern __inline __m256i
    153 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    154 _mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
    155 {
    156   return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
    157 					      (__v4di)__B,
    158 					      __N * 8);
    159 }
    160 #else
    161 /* In that case (__N*8) will be in vreg, and insn will not be matched. */
    162 /* Use define instead */
    163 #define _mm256_alignr_epi8(A, B, N)				   \
    164   ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A),	   \
    165 					(__v4di)(__m256i)(B),	   \
    166 					(int)(N) * 8))
    167 #endif
    168 
    169 extern __inline __m256i
    170 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    171 _mm256_and_si256 (__m256i __A, __m256i __B)
    172 {
    173   return (__m256i) __builtin_ia32_andsi256 ((__v4di)__A, (__v4di)__B);
    174 }
    175 
    176 extern __inline __m256i
    177 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    178 _mm256_andnot_si256 (__m256i __A, __m256i __B)
    179 {
    180   return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
    181 }
    182 
    183 extern __inline __m256i
    184 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    185 _mm256_avg_epu8 (__m256i __A, __m256i __B)
    186 {
    187   return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
    188 }
    189 
    190 extern __inline __m256i
    191 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    192 _mm256_avg_epu16 (__m256i __A, __m256i __B)
    193 {
    194   return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
    195 }
    196 
    197 extern __inline __m256i
    198 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    199 _mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
    200 {
    201   return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
    202 					       (__v32qi)__Y,
    203 					       (__v32qi)__M);
    204 }
    205 
    206 #ifdef __OPTIMIZE__
    207 extern __inline __m256i
    208 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    209 _mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
    210 {
    211   return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
    212 					      (__v16hi)__Y,
    213 					       __M);
    214 }
    215 #else
    216 #define _mm256_blend_epi16(X, Y, M)					\
    217   ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X),		\
    218 					(__v16hi)(__m256i)(Y), (int)(M)))
    219 #endif
    220 
    221 extern __inline __m256i
    222 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    223 _mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
    224 {
    225   return (__m256i)__builtin_ia32_pcmpeqb256 ((__v32qi)__A, (__v32qi)__B);
    226 }
    227 
    228 extern __inline __m256i
    229 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    230 _mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
    231 {
    232   return (__m256i)__builtin_ia32_pcmpeqw256 ((__v16hi)__A, (__v16hi)__B);
    233 }
    234 
    235 extern __inline __m256i
    236 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    237 _mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
    238 {
    239   return (__m256i)__builtin_ia32_pcmpeqd256 ((__v8si)__A, (__v8si)__B);
    240 }
    241 
    242 extern __inline __m256i
    243 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    244 _mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
    245 {
    246   return (__m256i)__builtin_ia32_pcmpeqq256 ((__v4di)__A, (__v4di)__B);
    247 }
    248 
    249 extern __inline __m256i
    250 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    251 _mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
    252 {
    253   return (__m256i)__builtin_ia32_pcmpgtb256 ((__v32qi)__A,
    254 					     (__v32qi)__B);
    255 }
    256 
    257 extern __inline __m256i
    258 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    259 _mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
    260 {
    261   return (__m256i)__builtin_ia32_pcmpgtw256 ((__v16hi)__A,
    262 					     (__v16hi)__B);
    263 }
    264 
    265 extern __inline __m256i
    266 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    267 _mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
    268 {
    269   return (__m256i)__builtin_ia32_pcmpgtd256 ((__v8si)__A,
    270 					     (__v8si)__B);
    271 }
    272 
    273 extern __inline __m256i
    274 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    275 _mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
    276 {
    277   return (__m256i)__builtin_ia32_pcmpgtq256 ((__v4di)__A, (__v4di)__B);
    278 }
    279 
    280 extern __inline __m256i
    281 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    282 _mm256_hadd_epi16 (__m256i __X, __m256i __Y)
    283 {
    284   return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
    285 					     (__v16hi)__Y);
    286 }
    287 
    288 extern __inline __m256i
    289 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    290 _mm256_hadd_epi32 (__m256i __X, __m256i __Y)
    291 {
    292   return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
    293 }
    294 
    295 extern __inline __m256i
    296 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    297 _mm256_hadds_epi16 (__m256i __X, __m256i __Y)
    298 {
    299   return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
    300 					      (__v16hi)__Y);
    301 }
    302 
    303 extern __inline __m256i
    304 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    305 _mm256_hsub_epi16 (__m256i __X, __m256i __Y)
    306 {
    307   return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
    308 					     (__v16hi)__Y);
    309 }
    310 
    311 extern __inline __m256i
    312 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    313 _mm256_hsub_epi32 (__m256i __X, __m256i __Y)
    314 {
    315   return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
    316 }
    317 
    318 extern __inline __m256i
    319 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    320 _mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
    321 {
    322   return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
    323 					      (__v16hi)__Y);
    324 }
    325 
    326 extern __inline __m256i
    327 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    328 _mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
    329 {
    330   return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
    331 						(__v32qi)__Y);
    332 }
    333 
    334 extern __inline __m256i
    335 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    336 _mm256_madd_epi16 (__m256i __A, __m256i __B)
    337 {
    338   return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
    339 					     (__v16hi)__B);
    340 }
    341 
    342 extern __inline __m256i
    343 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    344 _mm256_max_epi8 (__m256i __A, __m256i __B)
    345 {
    346   return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
    347 }
    348 
    349 extern __inline __m256i
    350 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    351 _mm256_max_epi16 (__m256i __A, __m256i __B)
    352 {
    353   return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
    354 }
    355 
    356 extern __inline __m256i
    357 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    358 _mm256_max_epi32 (__m256i __A, __m256i __B)
    359 {
    360   return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
    361 }
    362 
    363 extern __inline __m256i
    364 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    365 _mm256_max_epu8 (__m256i __A, __m256i __B)
    366 {
    367   return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
    368 }
    369 
    370 extern __inline __m256i
    371 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    372 _mm256_max_epu16 (__m256i __A, __m256i __B)
    373 {
    374   return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
    375 }
    376 
    377 extern __inline __m256i
    378 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    379 _mm256_max_epu32 (__m256i __A, __m256i __B)
    380 {
    381   return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
    382 }
    383 
    384 extern __inline __m256i
    385 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    386 _mm256_min_epi8 (__m256i __A, __m256i __B)
    387 {
    388   return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
    389 }
    390 
    391 extern __inline __m256i
    392 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    393 _mm256_min_epi16 (__m256i __A, __m256i __B)
    394 {
    395   return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
    396 }
    397 
    398 extern __inline __m256i
    399 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    400 _mm256_min_epi32 (__m256i __A, __m256i __B)
    401 {
    402   return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
    403 }
    404 
    405 extern __inline __m256i
    406 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    407 _mm256_min_epu8 (__m256i __A, __m256i __B)
    408 {
    409   return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
    410 }
    411 
    412 extern __inline __m256i
    413 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    414 _mm256_min_epu16 (__m256i __A, __m256i __B)
    415 {
    416   return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
    417 }
    418 
    419 extern __inline __m256i
    420 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    421 _mm256_min_epu32 (__m256i __A, __m256i __B)
    422 {
    423   return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
    424 }
    425 
    426 extern __inline int
    427 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    428 _mm256_movemask_epi8 (__m256i __A)
    429 {
    430   return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
    431 }
    432 
    433 extern __inline __m256i
    434 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    435 _mm256_cvtepi8_epi16 (__m128i __X)
    436 {
    437   return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
    438 }
    439 
    440 extern __inline __m256i
    441 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    442 _mm256_cvtepi8_epi32 (__m128i __X)
    443 {
    444   return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
    445 }
    446 
    447 extern __inline __m256i
    448 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    449 _mm256_cvtepi8_epi64 (__m128i __X)
    450 {
    451   return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
    452 }
    453 
    454 extern __inline __m256i
    455 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    456 _mm256_cvtepi16_epi32 (__m128i __X)
    457 {
    458   return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
    459 }
    460 
    461 extern __inline __m256i
    462 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    463 _mm256_cvtepi16_epi64 (__m128i __X)
    464 {
    465   return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
    466 }
    467 
    468 extern __inline __m256i
    469 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    470 _mm256_cvtepi32_epi64 (__m128i __X)
    471 {
    472   return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
    473 }
    474 
    475 extern __inline __m256i
    476 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    477 _mm256_cvtepu8_epi16 (__m128i __X)
    478 {
    479   return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
    480 }
    481 
    482 extern __inline __m256i
    483 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    484 _mm256_cvtepu8_epi32 (__m128i __X)
    485 {
    486   return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
    487 }
    488 
    489 extern __inline __m256i
    490 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    491 _mm256_cvtepu8_epi64 (__m128i __X)
    492 {
    493   return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
    494 }
    495 
    496 extern __inline __m256i
    497 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    498 _mm256_cvtepu16_epi32 (__m128i __X)
    499 {
    500   return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
    501 }
    502 
    503 extern __inline __m256i
    504 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    505 _mm256_cvtepu16_epi64 (__m128i __X)
    506 {
    507   return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
    508 }
    509 
    510 extern __inline __m256i
    511 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    512 _mm256_cvtepu32_epi64 (__m128i __X)
    513 {
    514   return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
    515 }
    516 
    517 extern __inline __m256i
    518 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    519 _mm256_mul_epi32 (__m256i __X, __m256i __Y)
    520 {
    521   return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
    522 }
    523 
    524 extern __inline __m256i
    525 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    526 _mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
    527 {
    528   return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
    529 					       (__v16hi)__Y);
    530 }
    531 
    532 extern __inline __m256i
    533 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    534 _mm256_mulhi_epu16 (__m256i __A, __m256i __B)
    535 {
    536   return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
    537 }
    538 
    539 extern __inline __m256i
    540 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    541 _mm256_mulhi_epi16 (__m256i __A, __m256i __B)
    542 {
    543   return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
    544 }
    545 
    546 extern __inline __m256i
    547 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    548 _mm256_mullo_epi16 (__m256i __A, __m256i __B)
    549 {
    550   return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B);
    551 }
    552 
    553 extern __inline __m256i
    554 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    555 _mm256_mullo_epi32 (__m256i __A, __m256i __B)
    556 {
    557   return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B);
    558 }
    559 
    560 extern __inline __m256i
    561 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    562 _mm256_mul_epu32 (__m256i __A, __m256i __B)
    563 {
    564   return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
    565 }
    566 
    567 extern __inline __m256i
    568 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    569 _mm256_or_si256 (__m256i __A, __m256i __B)
    570 {
    571   return (__m256i)__builtin_ia32_por256 ((__v4di)__A, (__v4di)__B);
    572 }
    573 
    574 extern __inline __m256i
    575 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    576 _mm256_sad_epu8 (__m256i __A, __m256i __B)
    577 {
    578   return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
    579 }
    580 
    581 extern __inline __m256i
    582 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    583 _mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
    584 {
    585   return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
    586 					     (__v32qi)__Y);
    587 }
    588 
    589 #ifdef __OPTIMIZE__
    590 extern __inline __m256i
    591 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    592 _mm256_shuffle_epi32 (__m256i __A, const int __mask)
    593 {
    594   return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
    595 }
    596 
    597 extern __inline __m256i
    598 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    599 _mm256_shufflehi_epi16 (__m256i __A, const int __mask)
    600 {
    601   return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
    602 }
    603 
    604 extern __inline __m256i
    605 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    606 _mm256_shufflelo_epi16 (__m256i __A, const int __mask)
    607 {
    608   return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
    609 }
    610 #else
    611 #define _mm256_shuffle_epi32(A, N) \
    612   ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
    613 #define _mm256_shufflehi_epi16(A, N) \
    614   ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
    615 #define _mm256_shufflelo_epi16(A, N) \
    616   ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
    617 #endif
    618 
    619 extern __inline __m256i
    620 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    621 _mm256_sign_epi8 (__m256i __X, __m256i __Y)
    622 {
    623   return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
    624 }
    625 
    626 extern __inline __m256i
    627 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    628 _mm256_sign_epi16 (__m256i __X, __m256i __Y)
    629 {
    630   return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
    631 }
    632 
    633 extern __inline __m256i
    634 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    635 _mm256_sign_epi32 (__m256i __X, __m256i __Y)
    636 {
    637   return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
    638 }
    639 
    640 #ifdef __OPTIMIZE__
    641 extern __inline __m256i
    642 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    643 _mm256_slli_si256 (__m256i __A, const int __N)
    644 {
    645   return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
    646 }
    647 #else
    648 #define _mm256_slli_si256(A, N) \
    649   ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
    650 #endif
    651 
    652 extern __inline __m256i
    653 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    654 _mm256_slli_epi16 (__m256i __A, int __B)
    655 {
    656   return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
    657 }
    658 
    659 extern __inline __m256i
    660 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    661 _mm256_sll_epi16 (__m256i __A, __m128i __B)
    662 {
    663   return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
    664 }
    665 
    666 extern __inline __m256i
    667 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    668 _mm256_slli_epi32 (__m256i __A, int __B)
    669 {
    670   return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
    671 }
    672 
    673 extern __inline __m256i
    674 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    675 _mm256_sll_epi32 (__m256i __A, __m128i __B)
    676 {
    677   return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
    678 }
    679 
    680 extern __inline __m256i
    681 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    682 _mm256_slli_epi64 (__m256i __A, int __B)
    683 {
    684   return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
    685 }
    686 
    687 extern __inline __m256i
    688 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    689 _mm256_sll_epi64 (__m256i __A, __m128i __B)
    690 {
    691   return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
    692 }
    693 
    694 extern __inline __m256i
    695 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    696 _mm256_srai_epi16 (__m256i __A, int __B)
    697 {
    698   return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
    699 }
    700 
    701 extern __inline __m256i
    702 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    703 _mm256_sra_epi16 (__m256i __A, __m128i __B)
    704 {
    705   return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
    706 }
    707 
    708 extern __inline __m256i
    709 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    710 _mm256_srai_epi32 (__m256i __A, int __B)
    711 {
    712   return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
    713 }
    714 
    715 extern __inline __m256i
    716 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    717 _mm256_sra_epi32 (__m256i __A, __m128i __B)
    718 {
    719   return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
    720 }
    721 
    722 #ifdef __OPTIMIZE__
    723 extern __inline __m256i
    724 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    725 _mm256_srli_si256 (__m256i __A, const int __N)
    726 {
    727   return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
    728 }
    729 #else
    730 #define _mm256_srli_si256(A, N) \
    731   ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
    732 #endif
    733 
    734 extern __inline __m256i
    735 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    736 _mm256_srli_epi16 (__m256i __A, int __B)
    737 {
    738   return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
    739 }
    740 
    741 extern __inline __m256i
    742 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    743 _mm256_srl_epi16 (__m256i __A, __m128i __B)
    744 {
    745   return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
    746 }
    747 
    748 extern __inline __m256i
    749 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    750 _mm256_srli_epi32 (__m256i __A, int __B)
    751 {
    752   return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
    753 }
    754 
    755 extern __inline __m256i
    756 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    757 _mm256_srl_epi32 (__m256i __A, __m128i __B)
    758 {
    759   return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
    760 }
    761 
    762 extern __inline __m256i
    763 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    764 _mm256_srli_epi64 (__m256i __A, int __B)
    765 {
    766   return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
    767 }
    768 
    769 extern __inline __m256i
    770 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    771 _mm256_srl_epi64 (__m256i __A, __m128i __B)
    772 {
    773   return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
    774 }
    775 
    776 extern __inline __m256i
    777 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    778 _mm256_sub_epi8 (__m256i __A, __m256i __B)
    779 {
    780   return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B);
    781 }
    782 
    783 extern __inline __m256i
    784 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    785 _mm256_sub_epi16 (__m256i __A, __m256i __B)
    786 {
    787   return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B);
    788 }
    789 
    790 extern __inline __m256i
    791 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    792 _mm256_sub_epi32 (__m256i __A, __m256i __B)
    793 {
    794   return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B);
    795 }
    796 
    797 extern __inline __m256i
    798 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    799 _mm256_sub_epi64 (__m256i __A, __m256i __B)
    800 {
    801   return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B);
    802 }
    803 
    804 extern __inline __m256i
    805 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    806 _mm256_subs_epi8 (__m256i __A, __m256i __B)
    807 {
    808   return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
    809 }
    810 
    811 extern __inline __m256i
    812 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    813 _mm256_subs_epi16 (__m256i __A, __m256i __B)
    814 {
    815   return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
    816 }
    817 
    818 extern __inline __m256i
    819 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    820 _mm256_subs_epu8 (__m256i __A, __m256i __B)
    821 {
    822   return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
    823 }
    824 
    825 extern __inline __m256i
    826 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    827 _mm256_subs_epu16 (__m256i __A, __m256i __B)
    828 {
    829   return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
    830 }
    831 
    832 extern __inline __m256i
    833 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    834 _mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
    835 {
    836   return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
    837 }
    838 
    839 extern __inline __m256i
    840 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    841 _mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
    842 {
    843   return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
    844 }
    845 
    846 extern __inline __m256i
    847 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    848 _mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
    849 {
    850   return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
    851 }
    852 
    853 extern __inline __m256i
    854 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    855 _mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
    856 {
    857   return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
    858 }
    859 
    860 extern __inline __m256i
    861 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    862 _mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
    863 {
    864   return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
    865 }
    866 
    867 extern __inline __m256i
    868 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    869 _mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
    870 {
    871   return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
    872 }
    873 
    874 extern __inline __m256i
    875 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    876 _mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
    877 {
    878   return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
    879 }
    880 
    881 extern __inline __m256i
    882 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    883 _mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
    884 {
    885   return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
    886 }
    887 
    888 extern __inline __m256i
    889 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    890 _mm256_xor_si256 (__m256i __A, __m256i __B)
    891 {
    892   return (__m256i)__builtin_ia32_pxor256 ((__v4di)__A, (__v4di)__B);
    893 }
    894 
    895 extern __inline __m256i
    896 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    897 _mm256_stream_load_si256 (__m256i const *__X)
    898 {
    899   return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
    900 }
    901 
    902 extern __inline __m128
    903 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    904 _mm_broadcastss_ps (__m128 __X)
    905 {
    906   return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
    907 }
    908 
    909 extern __inline __m256
    910 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    911 _mm256_broadcastss_ps (__m128 __X)
    912 {
    913   return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
    914 }
    915 
    916 extern __inline __m256d
    917 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    918 _mm256_broadcastsd_pd (__m128d __X)
    919 {
    920   return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
    921 }
    922 
    923 extern __inline __m256i
    924 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    925 _mm_broadcastsi128_si256 (__m128i __X)
    926 {
    927   return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
    928 }
    929 
    930 #ifdef __OPTIMIZE__
    931 extern __inline __m128i
    932 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    933 _mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
    934 {
    935   return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
    936 					      (__v4si)__Y,
    937 					      __M);
    938 }
    939 #else
    940 #define _mm_blend_epi32(X, Y, M)					\
    941   ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X),		\
    942 					(__v4si)(__m128i)(Y), (int)(M)))
    943 #endif
    944 
    945 #ifdef __OPTIMIZE__
    946 extern __inline __m256i
    947 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    948 _mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
    949 {
    950   return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
    951 					      (__v8si)__Y,
    952 					      __M);
    953 }
    954 #else
    955 #define _mm256_blend_epi32(X, Y, M)					\
    956   ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X),		\
    957 					(__v8si)(__m256i)(Y), (int)(M)))
    958 #endif
    959 
    960 extern __inline __m256i
    961 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    962 _mm256_broadcastb_epi8 (__m128i __X)
    963 {
    964   return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
    965 }
    966 
    967 extern __inline __m256i
    968 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    969 _mm256_broadcastw_epi16 (__m128i __X)
    970 {
    971   return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
    972 }
    973 
    974 extern __inline __m256i
    975 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    976 _mm256_broadcastd_epi32 (__m128i __X)
    977 {
    978   return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
    979 }
    980 
    981 extern __inline __m256i
    982 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    983 _mm256_broadcastq_epi64 (__m128i __X)
    984 {
    985   return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
    986 }
    987 
    988 extern __inline __m128i
    989 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    990 _mm_broadcastb_epi8 (__m128i __X)
    991 {
    992   return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
    993 }
    994 
    995 extern __inline __m128i
    996 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
    997 _mm_broadcastw_epi16 (__m128i __X)
    998 {
    999   return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
   1000 }
   1001 
   1002 extern __inline __m128i
   1003 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1004 _mm_broadcastd_epi32 (__m128i __X)
   1005 {
   1006   return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
   1007 }
   1008 
   1009 extern __inline __m128i
   1010 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1011 _mm_broadcastq_epi64 (__m128i __X)
   1012 {
   1013   return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
   1014 }
   1015 
   1016 extern __inline __m256i
   1017 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1018 _mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
   1019 {
   1020   return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
   1021 }
   1022 
   1023 #ifdef __OPTIMIZE__
   1024 extern __inline __m256d
   1025 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1026 _mm256_permute4x64_pd (__m256d __X, const int __M)
   1027 {
   1028   return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
   1029 }
   1030 #else
   1031 #define _mm256_permute4x64_pd(X, M)			       \
   1032   ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
   1033 #endif
   1034 
   1035 extern __inline __m256
   1036 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1037 _mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
   1038 {
   1039   return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
   1040 }
   1041 
   1042 #ifdef __OPTIMIZE__
   1043 extern __inline __m256i
   1044 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1045 _mm256_permute4x64_epi64 (__m256i __X, const int __M)
   1046 {
   1047   return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
   1048 }
   1049 #else
   1050 #define _mm256_permute4x64_epi64(X, M)			       \
   1051   ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
   1052 #endif
   1053 
   1054 
   1055 #ifdef __OPTIMIZE__
   1056 extern __inline __m256i
   1057 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1058 _mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
   1059 {
   1060   return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
   1061 }
   1062 #else
   1063 #define _mm256_permute2x128_si256(X, Y, M)				\
   1064   ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
   1065 #endif
   1066 
   1067 #ifdef __OPTIMIZE__
   1068 extern __inline __m128i
   1069 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1070 _mm256_extracti128_si256 (__m256i __X, const int __M)
   1071 {
   1072   return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
   1073 }
   1074 #else
   1075 #define _mm256_extracti128_si256(X, M)				\
   1076   ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
   1077 #endif
   1078 
   1079 #ifdef __OPTIMIZE__
   1080 extern __inline __m256i
   1081 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1082 _mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
   1083 {
   1084   return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
   1085 }
   1086 #else
   1087 #define _mm256_inserti128_si256(X, Y, M)			 \
   1088   ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
   1089 					   (__v2di)(__m128i)(Y), \
   1090 					   (int)(M)))
   1091 #endif
   1092 
   1093 extern __inline __m256i
   1094 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1095 _mm256_maskload_epi32 (int const *__X, __m256i __M )
   1096 {
   1097   return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
   1098 						(__v8si)__M);
   1099 }
   1100 
   1101 extern __inline __m256i
   1102 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1103 _mm256_maskload_epi64 (long long const *__X, __m256i __M )
   1104 {
   1105   return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
   1106 						(__v4di)__M);
   1107 }
   1108 
   1109 extern __inline __m128i
   1110 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1111 _mm_maskload_epi32 (int const *__X, __m128i __M )
   1112 {
   1113   return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
   1114 					     (__v4si)__M);
   1115 }
   1116 
   1117 extern __inline __m128i
   1118 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1119 _mm_maskload_epi64 (long long const *__X, __m128i __M )
   1120 {
   1121   return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
   1122 					     (__v2di)__M);
   1123 }
   1124 
   1125 extern __inline void
   1126 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1127 _mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
   1128 {
   1129   __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
   1130 }
   1131 
   1132 extern __inline void
   1133 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1134 _mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
   1135 {
   1136   __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
   1137 }
   1138 
   1139 extern __inline void
   1140 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1141 _mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
   1142 {
   1143   __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
   1144 }
   1145 
   1146 extern __inline void
   1147 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1148 _mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
   1149 {
   1150   __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
   1151 }
   1152 
   1153 extern __inline __m256i
   1154 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1155 _mm256_sllv_epi32 (__m256i __X, __m256i __Y)
   1156 {
   1157   return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
   1158 }
   1159 
   1160 extern __inline __m128i
   1161 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1162 _mm_sllv_epi32 (__m128i __X, __m128i __Y)
   1163 {
   1164   return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
   1165 }
   1166 
   1167 extern __inline __m256i
   1168 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1169 _mm256_sllv_epi64 (__m256i __X, __m256i __Y)
   1170 {
   1171   return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
   1172 }
   1173 
   1174 extern __inline __m128i
   1175 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1176 _mm_sllv_epi64 (__m128i __X, __m128i __Y)
   1177 {
   1178   return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
   1179 }
   1180 
   1181 extern __inline __m256i
   1182 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1183 _mm256_srav_epi32 (__m256i __X, __m256i __Y)
   1184 {
   1185   return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
   1186 }
   1187 
   1188 extern __inline __m128i
   1189 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1190 _mm_srav_epi32 (__m128i __X, __m128i __Y)
   1191 {
   1192   return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
   1193 }
   1194 
   1195 extern __inline __m256i
   1196 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1197 _mm256_srlv_epi32 (__m256i __X, __m256i __Y)
   1198 {
   1199   return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
   1200 }
   1201 
   1202 extern __inline __m128i
   1203 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1204 _mm_srlv_epi32 (__m128i __X, __m128i __Y)
   1205 {
   1206   return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
   1207 }
   1208 
   1209 extern __inline __m256i
   1210 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1211 _mm256_srlv_epi64 (__m256i __X, __m256i __Y)
   1212 {
   1213   return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
   1214 }
   1215 
   1216 extern __inline __m128i
   1217 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1218 _mm_srlv_epi64 (__m128i __X, __m128i __Y)
   1219 {
   1220   return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
   1221 }
   1222 
   1223 #ifdef __OPTIMIZE__
   1224 extern __inline __m128d
   1225 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1226 _mm_i32gather_pd (double const *base, __m128i index, const int scale)
   1227 {
   1228   __v2df src = _mm_setzero_pd ();
   1229   __v2df mask = _mm_cmpeq_pd (src, src);
   1230 
   1231   return (__m128d) __builtin_ia32_gathersiv2df (src,
   1232 						base,
   1233 						(__v4si)index,
   1234 						mask,
   1235 						scale);
   1236 }
   1237 
   1238 extern __inline __m128d
   1239 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1240 _mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index,
   1241 		       __m128d mask, const int scale)
   1242 {
   1243   return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src,
   1244 						base,
   1245 						(__v4si)index,
   1246 						(__v2df)mask,
   1247 						scale);
   1248 }
   1249 
   1250 extern __inline __m256d
   1251 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1252 _mm256_i32gather_pd (double const *base, __m128i index, const int scale)
   1253 {
   1254   __v4df src = _mm256_setzero_pd ();
   1255   __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);
   1256 
   1257   return (__m256d) __builtin_ia32_gathersiv4df (src,
   1258 						base,
   1259 						(__v4si)index,
   1260 						mask,
   1261 						scale);
   1262 }
   1263 
   1264 extern __inline __m256d
   1265 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1266 _mm256_mask_i32gather_pd (__m256d src, double const *base,
   1267 			  __m128i index, __m256d mask, const int scale)
   1268 {
   1269   return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src,
   1270 						base,
   1271 						(__v4si)index,
   1272 						(__v4df)mask,
   1273 						scale);
   1274 }
   1275 
   1276 extern __inline __m128d
   1277 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1278 _mm_i64gather_pd (double const *base, __m128i index, const int scale)
   1279 {
   1280   __v2df src = _mm_setzero_pd ();
   1281   __v2df mask = _mm_cmpeq_pd (src, src);
   1282 
   1283   return (__m128d) __builtin_ia32_gatherdiv2df (src,
   1284 						base,
   1285 						(__v2di)index,
   1286 						mask,
   1287 						scale);
   1288 }
   1289 
   1290 extern __inline __m128d
   1291 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1292 _mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index,
   1293 		       __m128d mask, const int scale)
   1294 {
   1295   return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src,
   1296 						base,
   1297 						(__v2di)index,
   1298 						(__v2df)mask,
   1299 						scale);
   1300 }
   1301 
   1302 extern __inline __m256d
   1303 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1304 _mm256_i64gather_pd (double const *base, __m256i index, const int scale)
   1305 {
   1306   __v4df src = _mm256_setzero_pd ();
   1307   __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);
   1308 
   1309   return (__m256d) __builtin_ia32_gatherdiv4df (src,
   1310 						base,
   1311 						(__v4di)index,
   1312 						mask,
   1313 						scale);
   1314 }
   1315 
   1316 extern __inline __m256d
   1317 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1318 _mm256_mask_i64gather_pd (__m256d src, double const *base,
   1319 			  __m256i index, __m256d mask, const int scale)
   1320 {
   1321   return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src,
   1322 						base,
   1323 						(__v4di)index,
   1324 						(__v4df)mask,
   1325 						scale);
   1326 }
   1327 
   1328 extern __inline __m128
   1329 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1330 _mm_i32gather_ps (float const *base, __m128i index, const int scale)
   1331 {
   1332   __v4sf src = _mm_setzero_ps ();
   1333   __v4sf mask = _mm_cmpeq_ps (src, src);
   1334 
   1335   return (__m128) __builtin_ia32_gathersiv4sf (src,
   1336 					       base,
   1337 					       (__v4si)index,
   1338 					       mask,
   1339 					       scale);
   1340 }
   1341 
   1342 extern __inline __m128
   1343 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1344 _mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index,
   1345 		       __m128 mask, const int scale)
   1346 {
   1347   return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src,
   1348 					       base,
   1349 					       (__v4si)index,
   1350 					       (__v4sf)mask,
   1351 					       scale);
   1352 }
   1353 
   1354 extern __inline __m256
   1355 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1356 _mm256_i32gather_ps (float const *base, __m256i index, const int scale)
   1357 {
   1358   __v8sf src = _mm256_setzero_ps ();
   1359   __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
   1360 
   1361   return (__m256) __builtin_ia32_gathersiv8sf (src,
   1362 					       base,
   1363 					       (__v8si)index,
   1364 					       mask,
   1365 					       scale);
   1366 }
   1367 
   1368 extern __inline __m256
   1369 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1370 _mm256_mask_i32gather_ps (__m256 src, float const *base,
   1371 			  __m256i index, __m256 mask, const int scale)
   1372 {
   1373   return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src,
   1374 					       base,
   1375 					       (__v8si)index,
   1376 					       (__v8sf)mask,
   1377 					       scale);
   1378 }
   1379 
   1380 extern __inline __m128
   1381 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1382 _mm_i64gather_ps (float const *base, __m128i index, const int scale)
   1383 {
   1384   __v4sf src = _mm_setzero_ps ();
   1385   __v4sf mask = _mm_cmpeq_ps (src, src);
   1386 
   1387   return (__m128) __builtin_ia32_gatherdiv4sf (src,
   1388 					       base,
   1389 					       (__v2di)index,
   1390 					       mask,
   1391 					       scale);
   1392 }
   1393 
   1394 extern __inline __m128
   1395 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1396 _mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index,
   1397 		       __m128 mask, const int scale)
   1398 {
   1399   return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src,
   1400 						base,
   1401 						(__v2di)index,
   1402 						(__v4sf)mask,
   1403 						scale);
   1404 }
   1405 
   1406 extern __inline __m128
   1407 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1408 _mm256_i64gather_ps (float const *base, __m256i index, const int scale)
   1409 {
   1410   __v4sf src = _mm_setzero_ps ();
   1411   __v4sf mask = _mm_cmpeq_ps (src, src);
   1412 
   1413   return (__m128) __builtin_ia32_gatherdiv4sf256 (src,
   1414 						  base,
   1415 						  (__v4di)index,
   1416 						  mask,
   1417 						  scale);
   1418 }
   1419 
   1420 extern __inline __m128
   1421 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1422 _mm256_mask_i64gather_ps (__m128 src, float const *base,
   1423 			  __m256i index, __m128 mask, const int scale)
   1424 {
   1425   return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src,
   1426 						  base,
   1427 						  (__v4di)index,
   1428 						  (__v4sf)mask,
   1429 						  scale);
   1430 }
   1431 
   1432 extern __inline __m128i
   1433 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1434 _mm_i32gather_epi64 (long long int const *base,
   1435 		     __m128i index, const int scale)
   1436 {
   1437   __v2di src = __extension__ (__v2di){ 0, 0 };
   1438   __v2di mask = __extension__ (__v2di){ ~0, ~0 };
   1439 
   1440   return (__m128i) __builtin_ia32_gathersiv2di (src,
   1441 						base,
   1442 						(__v4si)index,
   1443 						mask,
   1444 						scale);
   1445 }
   1446 
   1447 extern __inline __m128i
   1448 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1449 _mm_mask_i32gather_epi64 (__m128i src, long long int const *base,
   1450 			  __m128i index, __m128i mask, const int scale)
   1451 {
   1452   return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src,
   1453 						base,
   1454 						(__v4si)index,
   1455 						(__v2di)mask,
   1456 						scale);
   1457 }
   1458 
   1459 extern __inline __m256i
   1460 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1461 _mm256_i32gather_epi64 (long long int const *base,
   1462 			__m128i index, const int scale)
   1463 {
   1464   __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
   1465   __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
   1466 
   1467   return (__m256i) __builtin_ia32_gathersiv4di (src,
   1468 						base,
   1469 						(__v4si)index,
   1470 						mask,
   1471 						scale);
   1472 }
   1473 
   1474 extern __inline __m256i
   1475 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1476 _mm256_mask_i32gather_epi64 (__m256i src, long long int const *base,
   1477 			     __m128i index, __m256i mask, const int scale)
   1478 {
   1479   return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src,
   1480 						base,
   1481 						(__v4si)index,
   1482 						(__v4di)mask,
   1483 						scale);
   1484 }
   1485 
   1486 extern __inline __m128i
   1487 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1488 _mm_i64gather_epi64 (long long int const *base,
   1489 		     __m128i index, const int scale)
   1490 {
   1491   __v2di src = __extension__ (__v2di){ 0, 0 };
   1492   __v2di mask = __extension__ (__v2di){ ~0, ~0 };
   1493 
   1494   return (__m128i) __builtin_ia32_gatherdiv2di (src,
   1495 						base,
   1496 						(__v2di)index,
   1497 						mask,
   1498 						scale);
   1499 }
   1500 
   1501 extern __inline __m128i
   1502 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1503 _mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index,
   1504 			  __m128i mask, const int scale)
   1505 {
   1506   return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src,
   1507 						base,
   1508 						(__v2di)index,
   1509 						(__v2di)mask,
   1510 						scale);
   1511 }
   1512 
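/* 256-bit form: gather four 64-bit integers using four 64-bit indices.  */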
   1513 extern __inline __m256i
   1514 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1515 _mm256_i64gather_epi64 (long long int const *base,
   1516 			__m256i index, const int scale)
   1517 {
   1518   __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
   1519   __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
   1520 
   1521   return (__m256i) __builtin_ia32_gatherdiv4di (src,
   1522 						base,
   1523 						(__v4di)index,
   1524 						mask,
   1525 						scale);
   1526 }
   1527 
   1528 extern __inline __m256i
   1529 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1530 _mm256_mask_i64gather_epi64 (__m256i src, long long int const *base,
   1531 			     __m256i index, __m256i mask, const int scale)
   1532 {
   1533   return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src,
   1534 						base,
   1535 						(__v4di)index,
   1536 						(__v4di)mask,
   1537 						scale);
   1538 }
   1539 
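/* Gather four 32-bit integers using the four 32-bit indices in index.  */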
   1540 extern __inline __m128i
   1541 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1542 _mm_i32gather_epi32 (int const *base, __m128i index, const int scale)
   1543 {
   1544   __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
   1545   __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
   1546 
   1547   return (__m128i) __builtin_ia32_gathersiv4si (src,
   1548 					       base,
   1549 					       (__v4si)index,
   1550 					       mask,
   1551 					       scale);
   1552 }
   1553 
   1554 extern __inline __m128i
   1555 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1556 _mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index,
   1557 			  __m128i mask, const int scale)
   1558 {
   1559   return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src,
   1560 						base,
   1561 						(__v4si)index,
   1562 						(__v4si)mask,
   1563 						scale);
   1564 }
   1565 
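/* 256-bit form: gather eight 32-bit integers using eight 32-bit
   indices.  */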
   1566 extern __inline __m256i
   1567 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1568 _mm256_i32gather_epi32 (int const *base, __m256i index, const int scale)
   1569 {
   1570   __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
   1571   __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };
   1572 
   1573   return (__m256i) __builtin_ia32_gathersiv8si (src,
   1574 						base,
   1575 						(__v8si)index,
   1576 						mask,
   1577 						scale);
   1578 }
   1579 
   1580 extern __inline __m256i
   1581 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1582 _mm256_mask_i32gather_epi32 (__m256i src, int const *base,
   1583 			     __m256i index, __m256i mask, const int scale)
   1584 {
   1585   return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src,
   1586 						base,
   1587 						(__v8si)index,
   1588 						(__v8si)mask,
   1589 						scale);
   1590 }
   1591 
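/* Gather two 32-bit integers using the two 64-bit indices in index; the
   upper two elements of the result are zeroed.  */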
   1592 extern __inline __m128i
   1593 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1594 _mm_i64gather_epi32 (int const *base, __m128i index, const int scale)
   1595 {
   1596   __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
   1597   __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
   1598 
   1599   return (__m128i) __builtin_ia32_gatherdiv4si (src,
   1600 						base,
   1601 						(__v2di)index,
   1602 						mask,
   1603 						scale);
   1604 }
   1605 
   1606 extern __inline __m128i
   1607 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1608 _mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index,
   1609 			  __m128i mask, const int scale)
   1610 {
   1611   return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src,
   1612 						base,
   1613 						(__v2di)index,
   1614 						(__v4si)mask,
   1615 						scale);
   1616 }
   1617 
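/* Gather four 32-bit integers using the four 64-bit indices in index,
   returning them in a 128-bit result.  */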
   1618 extern __inline __m128i
   1619 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1620 _mm256_i64gather_epi32 (int const *base, __m256i index, const int scale)
   1621 {
   1622   __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
   1623   __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
   1624 
   1625   return (__m128i) __builtin_ia32_gatherdiv4si256 (src,
   1626 						  base,
   1627 						  (__v4di)index,
   1628 						  mask,
   1629 						  scale);
   1630 }
   1631 
   1632 extern __inline __m128i
   1633 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
   1634 _mm256_mask_i64gather_epi32 (__m128i src, int const *base,
   1635 			     __m256i index, __m128i mask, const int scale)
   1636 {
   1637   return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src,
   1638 						   base,
   1639 						   (__v4di)index,
   1640 						   (__v4si)mask,
   1641 						   scale);
   1642 }
   1643 #else /* __OPTIMIZE__ */
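/* Without optimization the constant scale argument of an inline wrapper
   does not reach the builtin as an immediate, and the gather builtins
   require an immediate scale of 1, 2, 4 or 8, so macro forms are used
   instead.  */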
   1644 #define _mm_i32gather_pd(BASE, INDEX, SCALE)				\
   1645   (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (),	\
   1646 					 (double const *)BASE,		\
   1647 					 (__v4si)(__m128i)INDEX,	\
   1648 					 (__v2df)_mm_set1_pd(		\
   1649 					   (double)(long long int) -1), \
   1650 					 (int)SCALE)
   1651 
   1652 #define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
   1653   (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC,	 \
   1654 					 (double const *)BASE,	 \
   1655 					 (__v4si)(__m128i)INDEX, \
   1656 					 (__v2df)(__m128d)MASK,	 \
   1657 					 (int)SCALE)
   1658 
   1659 #define _mm256_i32gather_pd(BASE, INDEX, SCALE)				\
   1660   (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (),	\
   1661 					 (double const *)BASE,		\
   1662 					 (__v4si)(__m128i)INDEX,	\
   1663 					 (__v4df)_mm256_set1_pd(	\
   1664 					   (double)(long long int) -1), \
   1665 					 (int)SCALE)
   1666 
   1667 #define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
   1668   (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC,	 \
   1669 					 (double const *)BASE,	 \
   1670 					 (__v4si)(__m128i)INDEX, \
   1671 					 (__v4df)(__m256d)MASK,	 \
   1672 					 (int)SCALE)
   1673 
   1674 #define _mm_i64gather_pd(BASE, INDEX, SCALE)				\
   1675   (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (),	\
   1676 					 (double const *)BASE,		\
   1677 					 (__v2di)(__m128i)INDEX,	\
   1678 					 (__v2df)_mm_set1_pd(		\
   1679 					   (double)(long long int) -1), \
   1680 					 (int)SCALE)
   1681 
   1682 #define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
   1683   (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC,	 \
   1684 					 (double const *)BASE,	 \
   1685 					 (__v2di)(__m128i)INDEX, \
   1686 					 (__v2df)(__m128d)MASK,	 \
   1687 					 (int)SCALE)
   1688 
   1689 #define _mm256_i64gather_pd(BASE, INDEX, SCALE)				\
   1690   (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (),	\
   1691 					 (double const *)BASE,		\
   1692 					 (__v4di)(__m256i)INDEX,	\
   1693 					 (__v4df)_mm256_set1_pd(	\
   1694 					   (double)(long long int) -1), \
   1695 					 (int)SCALE)
   1696 
   1697 #define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
   1698   (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC,	 \
   1699 					 (double const *)BASE,	 \
   1700 					 (__v4di)(__m256i)INDEX, \
   1701 					 (__v4df)(__m256d)MASK,	 \
   1702 					 (int)SCALE)
   1703 
   1704 #define _mm_i32gather_ps(BASE, INDEX, SCALE)				\
   1705   (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (),	\
   1706 					(float const *)BASE,		\
   1707 					(__v4si)(__m128i)INDEX,		\
   1708 					(__v4sf)_mm_set1_ps ((float)(int) -1), \
   1709 					(int)SCALE)
   1710 
   1711 #define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)	 \
   1712   (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128)SRC,	 \
   1713 					(float const *)BASE,	 \
   1714 					(__v4si)(__m128i)INDEX,	 \
   1715 					(__v4sf)(__m128)MASK,	 \
   1716 					(int)SCALE)
   1717 
   1718 #define _mm256_i32gather_ps(BASE, INDEX, SCALE)			       \
   1719   (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
   1720 					(float const *)BASE,	       \
   1721 					(__v8si)(__m256i)INDEX,	       \
   1722 					(__v8sf)_mm256_set1_ps (       \
   1723 					  (float)(int) -1),	       \
   1724 					(int)SCALE)
   1725 
   1726 #define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
   1727   (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC,	\
   1728 					(float const *)BASE,	\
   1729 					(__v8si)(__m256i)INDEX, \
   1730 					(__v8sf)(__m256)MASK,	\
   1731 					(int)SCALE)
   1732 
   1733 #define _mm_i64gather_ps(BASE, INDEX, SCALE)				\
   1734   (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_ps (),	\
   1735 					(float const *)BASE,		\
   1736 					(__v2di)(__m128i)INDEX,		\
   1737 					(__v4sf)_mm_set1_ps (		\
   1738 					  (float)(int) -1),		\
   1739 					(int)SCALE)
   1740 
   1741 #define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	 \
   1742   (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC,	 \
   1743 					(float const *)BASE,	 \
   1744 					(__v2di)(__m128i)INDEX,	 \
   1745 					(__v4sf)(__m128)MASK,	 \
   1746 					(int)SCALE)
   1747 
   1748 #define _mm256_i64gather_ps(BASE, INDEX, SCALE)				\
   1749   (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (),	\
   1750 					   (float const *)BASE,		\
   1751 					   (__v4di)(__m256i)INDEX,	\
   1752 					   (__v4sf)_mm_set1_ps(		\
   1753 					     (float)(int) -1),		\
   1754 					   (int)SCALE)
   1755 
   1756 #define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	   \
   1757   (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC,	   \
   1758 					   (float const *)BASE,	   \
   1759 					   (__v4di)(__m256i)INDEX, \
   1760 					   (__v4sf)(__m128)MASK,   \
   1761 					   (int)SCALE)
   1762 
   1763 #define _mm_i32gather_epi64(BASE, INDEX, SCALE)				\
   1764   (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
   1765 					 (long long const *)BASE,	\
   1766 					 (__v4si)(__m128i)INDEX,	\
   1767 					 (__v2di)_mm_set1_epi64x (-1),	\
   1768 					 (int)SCALE)
   1769 
   1770 #define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE)	  \
   1771   (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC,	  \
   1772 					 (long long const *)BASE, \
   1773 					 (__v4si)(__m128i)INDEX,  \
   1774 					 (__v2di)(__m128i)MASK,	  \
   1775 					 (int)SCALE)
   1776 
   1777 #define _mm256_i32gather_epi64(BASE, INDEX, SCALE)			   \
   1778   (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
   1779 					 (long long const *)BASE,	   \
   1780 					 (__v4si)(__m128i)INDEX,	   \
   1781 					 (__v4di)_mm256_set1_epi64x (-1),  \
   1782 					 (int)SCALE)
   1783 
   1784 #define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
   1785   (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC,	   \
   1786 					 (long long const *)BASE,  \
   1787 					 (__v4si)(__m128i)INDEX,   \
   1788 					 (__v4di)(__m256i)MASK,	   \
   1789 					 (int)SCALE)
   1790 
   1791 #define _mm_i64gather_epi64(BASE, INDEX, SCALE)				\
   1792   (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
   1793 					 (long long const *)BASE,	\
   1794 					 (__v2di)(__m128i)INDEX,	\
   1795 					 (__v2di)_mm_set1_epi64x (-1),	\
   1796 					 (int)SCALE)
   1797 
   1798 #define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE)	  \
   1799   (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC,	  \
   1800 					 (long long const *)BASE, \
   1801 					 (__v2di)(__m128i)INDEX,  \
   1802 					 (__v2di)(__m128i)MASK,	  \
   1803 					 (int)SCALE)
   1804 
   1805 #define _mm256_i64gather_epi64(BASE, INDEX, SCALE)			   \
   1806   (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
   1807 					 (long long const *)BASE,	   \
   1808 					 (__v4di)(__m256i)INDEX,	   \
   1809 					 (__v4di)_mm256_set1_epi64x (-1),  \
   1810 					 (int)SCALE)
   1811 
   1812 #define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
   1813   (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC,	   \
   1814 					 (long long const *)BASE,  \
   1815 					 (__v4di)(__m256i)INDEX,   \
   1816 					 (__v4di)(__m256i)MASK,	   \
   1817 					 (int)SCALE)
   1818 
   1819 #define _mm_i32gather_epi32(BASE, INDEX, SCALE)				\
   1820   (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (),	\
   1821 					 (int const *)BASE,		\
   1822 					 (__v4si)(__m128i)INDEX,	\
   1823 					 (__v4si)_mm_set1_epi32 (-1),	\
   1824 					 (int)SCALE)
   1825 
   1826 #define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
   1827   (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC,	\
   1828 					(int const *)BASE,	\
   1829 					(__v4si)(__m128i)INDEX, \
   1830 					(__v4si)(__m128i)MASK,	\
   1831 					(int)SCALE)
   1832 
   1833 #define _mm256_i32gather_epi32(BASE, INDEX, SCALE)			   \
   1834   (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
   1835 					 (int const *)BASE,		   \
   1836 					 (__v8si)(__m256i)INDEX,	   \
   1837 					 (__v8si)_mm256_set1_epi32 (-1),   \
   1838 					 (int)SCALE)
   1839 
   1840 #define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
   1841   (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC,	   \
   1842 					(int const *)BASE,	   \
   1843 					(__v8si)(__m256i)INDEX,	   \
   1844 					(__v8si)(__m256i)MASK,	   \
   1845 					(int)SCALE)
   1846 
   1847 #define _mm_i64gather_epi32(BASE, INDEX, SCALE)				\
   1848   (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (),	\
   1849 					 (int const *)BASE,		\
   1850 					 (__v2di)(__m128i)INDEX,	\
   1851 					 (__v4si)_mm_set1_epi32 (-1),	\
   1852 					 (int)SCALE)
   1853 
   1854 #define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
   1855   (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC,	\
   1856 					(int const *)BASE,	\
   1857 					(__v2di)(__m128i)INDEX, \
   1858 					(__v4si)(__m128i)MASK,	\
   1859 					(int)SCALE)
   1860 
   1861 #define _mm256_i64gather_epi32(BASE, INDEX, SCALE)			   \
   1862   (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
   1863 					    (int const *)BASE,		   \
   1864 					    (__v4di)(__m256i)INDEX,	   \
   1865 					    (__v4si)_mm_set1_epi32(-1),	   \
   1866 					    (int)SCALE)
   1867 
   1868 #define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
   1869   (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC,  \
   1870 					   (int const *)BASE,	   \
   1871 					   (__v4di)(__m256i)INDEX, \
   1872 					   (__v4si)(__m128i)MASK,  \
   1873 					   (int)SCALE)
   1874 #endif  /* __OPTIMIZE__ */
   1875
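/* Illustrative usage sketch (editorial addition, not part of the header):
   a minimal unmasked 32-bit gather.  The names table, idx, out, fallback
   and vmask below are hypothetical and supplied by the caller; compile
   with -mavx2.

     #include <immintrin.h>

     void
     gather_eight (int const *table, int const *idx, int *out)
     {
       __m256i vindex = _mm256_loadu_si256 ((__m256i const *) idx);
       __m256i vals = _mm256_i32gather_epi32 (table, vindex, 4);
       _mm256_storeu_si256 ((__m256i *) out, vals);
     }

   The scale of 4 matches sizeof (int).  The masked forms take an
   additional source and mask operand; only the elements whose mask sign
   bit is set are loaded, e.g.:

     __m256i vals = _mm256_mask_i32gather_epi32 (fallback, table,
                                                 vindex, vmask, 4);  */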