/* avx2intrin.h -- internal AVX2 intrinsics header; include via <immintrin.h>.  */
      1 /* Copyright (C) 2011-2013 Free Software Foundation, Inc.
      2 
      3    This file is part of GCC.
      4 
      5    GCC is free software; you can redistribute it and/or modify
      6    it under the terms of the GNU General Public License as published by
      7    the Free Software Foundation; either version 3, or (at your option)
      8    any later version.
      9 
     10    GCC is distributed in the hope that it will be useful,
     11    but WITHOUT ANY WARRANTY; without even the implied warranty of
     12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13    GNU General Public License for more details.
     14 
     15    Under Section 7 of GPL version 3, you are granted additional
     16    permissions described in the GCC Runtime Library Exception, version
     17    3.1, as published by the Free Software Foundation.
     18 
     19    You should have received a copy of the GNU General Public License and
     20    a copy of the GCC Runtime Library Exception along with this program;
     21    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     22    <http://www.gnu.org/licenses/>.  */
     23 
     24 #ifndef _IMMINTRIN_H_INCLUDED
     25 # error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
     26 #endif
     27 
/* Sum absolute 8-bit integer differences of adjacent groups of 4
   byte integers in the first 2 operands.  Starting offsets within
   operands are determined by the 3rd mask operand.  (vmpsadbw)  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
					      (__v32qi)__Y, __M);
}
#else
/* Without optimization the inline's immediate argument would not fold
   to a constant, so fall back to a macro that keeps M a literal.  */
#define _mm256_mpsadbw_epu8(X, Y, M)					\
  ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X),		\
					(__v32qi)(__m256i)(Y), (int)(M)))
#endif
     44 
/* vpabsb: absolute value of each signed 8-bit element.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi8 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
}

/* vpabsw: absolute value of each signed 16-bit element.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi16 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
}

/* vpabsd: absolute value of each signed 32-bit element.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi32 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
}
     65 
/* vpackssdw: narrow 32-bit to 16-bit elements with signed saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
}

/* vpacksswb: narrow 16-bit to 8-bit elements with signed saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpackusdw: narrow 32-bit to 16-bit elements with unsigned saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
}

/* vpackuswb: narrow 16-bit to 8-bit elements with unsigned saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
}
     93 
/* vpaddb: element-wise add of packed 8-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpaddw: element-wise add of packed 16-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpaddd: element-wise add of packed 32-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B);
}

/* vpaddq: element-wise add of packed 64-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B);
}

/* vpaddsb: add packed 8-bit integers with signed saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpaddsw: add packed 16-bit integers with signed saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpaddusb: add packed 8-bit integers with unsigned saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpaddusw: add packed 16-bit integers with unsigned saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
}
    149 
/* vpalignr: concatenate and byte-shift right by __N; the builtin takes
   a bit count, hence the * 8.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
{
  return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
					      (__v4di)__B,
					      __N * 8);
}
#else
/* Without optimization (__N * 8) would land in a vector register and
   the insn pattern would not match, so use a macro instead.  */
#define _mm256_alignr_epi8(A, B, N)				   \
  ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A),	   \
					(__v4di)(__m256i)(B),	   \
					(int)(N) * 8))
#endif
    167 
/* vpand: bitwise AND of the full 256-bit registers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_andsi256 ((__v4di)__A, (__v4di)__B);
}

/* vpandn: bitwise AND of (NOT __A) with __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
}
    181 
/* vpavgb: rounded average of packed unsigned 8-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpavgw: rounded average of packed unsigned 16-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
}
    195 
/* vpblendvb: per-byte select between __X and __Y controlled by the
   sign bit of the corresponding byte of __M.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
{
  return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
					       (__v32qi)__Y,
					       (__v32qi)__M);
}

/* vpblendw: per-16-bit-element select controlled by immediate mask __M.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
					      (__v16hi)__Y,
					       __M);
}
#else
/* Macro fallback so the immediate stays a literal without optimization.  */
#define _mm256_blend_epi16(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X),		\
					(__v16hi)(__m256i)(Y), (int)(M)))
#endif
    219 
/* vpcmpeqb: per-8-bit compare for equality; result element is all ones
   on equal, all zeros otherwise.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpcmpeqw: per-16-bit compare for equality.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpcmpeqd: per-32-bit compare for equality.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqd256 ((__v8si)__A, (__v8si)__B);
}

/* vpcmpeqq: per-64-bit compare for equality.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqq256 ((__v4di)__A, (__v4di)__B);
}

/* vpcmpgtb: per-8-bit signed compare for __A > __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtb256 ((__v32qi)__A,
					     (__v32qi)__B);
}

/* vpcmpgtw: per-16-bit signed compare for __A > __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtw256 ((__v16hi)__A,
					     (__v16hi)__B);
}

/* vpcmpgtd: per-32-bit signed compare for __A > __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtd256 ((__v8si)__A,
					     (__v8si)__B);
}

/* vpcmpgtq: per-64-bit signed compare for __A > __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtq256 ((__v4di)__A, (__v4di)__B);
}
    278 
/* vphaddw: horizontal add of adjacent 16-bit element pairs.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
					     (__v16hi)__Y);
}

/* vphaddd: horizontal add of adjacent 32-bit element pairs.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
}

/* vphaddsw: horizontal add of adjacent 16-bit pairs with signed
   saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadds_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
					      (__v16hi)__Y);
}

/* vphsubw: horizontal subtract of adjacent 16-bit element pairs.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
					     (__v16hi)__Y);
}

/* vphsubd: horizontal subtract of adjacent 32-bit element pairs.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
}

/* vphsubsw: horizontal subtract of adjacent 16-bit pairs with signed
   saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
					      (__v16hi)__Y);
}
    324 
/* vpmaddubsw: multiply unsigned bytes of __X by signed bytes of __Y,
   add adjacent products to 16-bit results with signed saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
						(__v32qi)__Y);
}

/* vpmaddwd: multiply signed 16-bit elements, add adjacent products to
   32-bit results.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_madd_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
					     (__v16hi)__B);
}
    340 
/* vpmaxsb: element-wise maximum of signed 8-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpmaxsw: element-wise maximum of signed 16-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpmaxsd: element-wise maximum of signed 32-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
}

/* vpmaxub: element-wise maximum of unsigned 8-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpmaxuw: element-wise maximum of unsigned 16-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpmaxud: element-wise maximum of unsigned 32-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
}

/* vpminsb: element-wise minimum of signed 8-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpminsw: element-wise minimum of signed 16-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpminsd: element-wise minimum of signed 32-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
}

/* vpminub: element-wise minimum of unsigned 8-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpminuw: element-wise minimum of unsigned 16-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpminud: element-wise minimum of unsigned 32-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
}
    424 
/* vpmovmskb: gather the most significant bit of each of the 32 bytes
   of __A into the low 32 bits of the integer result.  */
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_epi8 (__m256i __A)
{
  return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
}
    431 
/* vpmovsxbw: sign-extend 16 bytes of __X to 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
}

/* vpmovsxbd: sign-extend low bytes of __X to 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
}

/* vpmovsxbq: sign-extend low bytes of __X to 64-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
}

/* vpmovsxwd: sign-extend 16-bit elements of __X to 32 bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
}

/* vpmovsxwq: sign-extend low 16-bit elements of __X to 64 bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
}

/* vpmovsxdq: sign-extend 32-bit elements of __X to 64 bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
}

/* vpmovzxbw: zero-extend 16 bytes of __X to 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
}

/* vpmovzxbd: zero-extend low bytes of __X to 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
}

/* vpmovzxbq: zero-extend low bytes of __X to 64-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
}

/* vpmovzxwd: zero-extend 16-bit elements of __X to 32 bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
}

/* vpmovzxwq: zero-extend low 16-bit elements of __X to 64 bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
}

/* vpmovzxdq: zero-extend 32-bit elements of __X to 64 bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
}
    515 
/* vpmuldq: multiply the even-indexed signed 32-bit elements, producing
   64-bit products.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
}

/* vpmulhrsw: signed 16-bit multiply, return rounded high bits
   (scale-and-round fixed-point multiply).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
					       (__v16hi)__Y);
}

/* vpmulhuw: unsigned 16-bit multiply, keep high 16 bits of product.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpmulhw: signed 16-bit multiply, keep high 16 bits of product.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpmullw: 16-bit multiply, keep low 16 bits of product.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpmulld: 32-bit multiply, keep low 32 bits of product.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B);
}

/* vpmuludq: multiply the even-indexed unsigned 32-bit elements,
   producing 64-bit products.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
}
    565 
/* vpor: bitwise OR of the full 256-bit registers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_si256 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_por256 ((__v4di)__A, (__v4di)__B);
}

/* vpsadbw: sums of absolute byte differences, accumulated per group of
   eight bytes into 64-bit results.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sad_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
}
    579 
/* vpshufb: per-byte shuffle of __X using the low bits of each byte of
   __Y as indices (a set sign bit in __Y zeroes the result byte).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
					     (__v32qi)__Y);
}

/* Immediate-controlled shuffles.  __mask must be a compile-time
   constant, hence the macro fallback when not optimizing.  */
#ifdef __OPTIMIZE__
/* vpshufd: shuffle 32-bit elements by immediate __mask.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi32 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
}

/* vpshufhw: shuffle the high four 16-bit elements of each half.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
}

/* vpshuflw: shuffle the low four 16-bit elements of each half.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
}
#else
#define _mm256_shuffle_epi32(A, N) \
  ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
#define _mm256_shufflehi_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
#define _mm256_shufflelo_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
#endif
    617 
/* vpsignb: negate, zero, or keep each 8-bit element of __X according to
   the sign of the corresponding element of __Y.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
}

/* vpsignw: as above for 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
}

/* vpsignd: as above for 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
}
    638 
/* vpslldq: byte-wise left shift by __N within each 128-bit lane; the
   builtin takes a bit count, hence the * 8.  The immediate must be a
   constant, so a macro is used when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}
#else
#define _mm256_slli_si256(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#endif

/* vpsllw (immediate count): shift 16-bit elements left by __B bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
}

/* vpsllw: shift 16-bit elements left by the count in __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
}

/* vpslld (immediate count): shift 32-bit elements left by __B bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
}

/* vpslld: shift 32-bit elements left by the count in __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
}

/* vpsllq (immediate count): shift 64-bit elements left by __B bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
}

/* vpsllq: shift 64-bit elements left by the count in __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
}
    692 
/* vpsraw (immediate count): arithmetic right shift of 16-bit elements
   by __B bits.  (AVX2 has no 64-bit arithmetic shift, hence no epi64
   variant here.)  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
}

/* vpsraw: arithmetic right shift of 16-bit elements by the count in
   __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
}

/* vpsrad (immediate count): arithmetic right shift of 32-bit elements
   by __B bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
}

/* vpsrad: arithmetic right shift of 32-bit elements by the count in
   __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
}
    720 
/* vpsrldq: byte-wise right shift by __N within each 128-bit lane; the
   builtin takes a bit count, hence the * 8.  The immediate must be a
   constant, so a macro is used when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}
#else
#define _mm256_srli_si256(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#endif

/* vpsrlw (immediate count): logical right shift of 16-bit elements by
   __B bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
}

/* vpsrlw: logical right shift of 16-bit elements by the count in __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
}

/* vpsrld (immediate count): logical right shift of 32-bit elements by
   __B bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
}

/* vpsrld: logical right shift of 32-bit elements by the count in __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
}

/* vpsrlq (immediate count): logical right shift of 64-bit elements by
   __B bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
}

/* vpsrlq: logical right shift of 64-bit elements by the count in __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
}
    774 
/* vpsubb: element-wise subtract of packed 8-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpsubw: element-wise subtract of packed 16-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpsubd: element-wise subtract of packed 32-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B);
}

/* vpsubq: element-wise subtract of packed 64-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B);
}

/* vpsubsb: subtract packed 8-bit integers with signed saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpsubsw: subtract packed 16-bit integers with signed saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpsubusb: subtract packed 8-bit integers with unsigned saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpsubusw: subtract packed 16-bit integers with unsigned saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
}
    830 
/* vpunpckhbw: interleave the high-half bytes of __A and __B within
   each 128-bit lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpunpckhwd: interleave the high-half 16-bit elements per lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpunpckhdq: interleave the high-half 32-bit elements per lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
}

/* vpunpckhqdq: interleave the high-half 64-bit elements per lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
}

/* vpunpcklbw: interleave the low-half bytes of __A and __B per lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpunpcklwd: interleave the low-half 16-bit elements per lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpunpckldq: interleave the low-half 32-bit elements per lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
}

/* vpunpcklqdq: interleave the low-half 64-bit elements per lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
}
    886 
/* Bitwise XOR of the two 256-bit operands (vpxor).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_si256 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pxor256 ((__v4di)__A, (__v4di)__B);
}

/* Streaming (non-temporal) 32-byte load from *__X (vmovntdqa).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_load_si256 (__m256i const *__X)
{
  return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
}

/* Broadcast the low single-precision element of __X to all four
   elements of the result (vbroadcastss).  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastss_ps (__m128 __X)
{
  return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
}

/* Broadcast the low single-precision element of __X to all eight
   elements of the result (vbroadcastss).  */
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastss_ps (__m128 __X)
{
  return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
}

/* Broadcast the low double-precision element of __X to all four
   elements of the result (vbroadcastsd).  */
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsd_pd (__m128d __X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
}

/* Broadcast the 128-bit value __X into both halves of the 256-bit
   result (vbroadcasti128).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsi128_si256 (__m128i __X)
{
  return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
}
    928 
/* Per-element select of 32-bit integers between __X and __Y, controlled
   by the immediate __M (vpblendd).  The macro form keeps __M a
   compile-time constant when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
					      (__v4si)__Y,
					      __M);
}
#else
#define _mm_blend_epi32(X, Y, M)					\
  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X),		\
					(__v4si)(__m128i)(Y), (int)(M)))
#endif

/* 256-bit variant of _mm_blend_epi32 (vpblendd).  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
					      (__v8si)__Y,
					      __M);
}
#else
#define _mm256_blend_epi32(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X),		\
					(__v8si)(__m256i)(Y), (int)(M)))
#endif
    958 
/* Broadcast the low byte of __X to all 32 bytes (vpbroadcastb).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastb_epi8 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
}

/* Broadcast the low 16-bit element of __X to all 16 elements
   (vpbroadcastw).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastw_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
}

/* Broadcast the low 32-bit element of __X to all 8 elements
   (vpbroadcastd).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastd_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
}

/* Broadcast the low 64-bit element of __X to all 4 elements
   (vpbroadcastq).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastq_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
}

/* Broadcast the low byte of __X to all 16 bytes (vpbroadcastb).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastb_epi8 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
}

/* Broadcast the low 16-bit element of __X to all 8 elements
   (vpbroadcastw).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastw_epi16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
}

/* Broadcast the low 32-bit element of __X to all 4 elements
   (vpbroadcastd).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastd_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
}

/* Broadcast the low 64-bit element of __X to both elements
   (vpbroadcastq).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastq_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
}
   1014 
/* Permute the eight 32-bit elements of __X using the indices in __Y
   (vpermd).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
}

/* Permute the four doubles of __X by the immediate selector __M
   (vpermpd).  */
#ifdef __OPTIMIZE__
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_pd (__m256d __X, const int __M)
{
  return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
}
#else
#define _mm256_permute4x64_pd(X, M)			       \
  ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
#endif

/* Permute the eight floats of __X using the indices in __Y
   (vpermps).  */
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
{
  return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
}

/* Permute the four 64-bit elements of __X by the immediate selector
   __M (vpermq).  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_epi64 (__m256i __X, const int __M)
{
  return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
}
#else
#define _mm256_permute4x64_epi64(X, M)			       \
  ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
#endif


/* Combine 128-bit lanes of __X and __Y as selected by the immediate
   __M (vperm2i128).  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
}
#else
#define _mm256_permute2x128_si256(X, Y, M)				\
  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
#endif
   1065 
/* Extract the 128-bit lane of __X selected by __M (vextracti128).  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extracti128_si256 (__m256i __X, const int __M)
{
  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
}
#else
#define _mm256_extracti128_si256(X, M)				\
  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
#endif

/* Insert __Y into the 128-bit lane of __X selected by __M
   (vinserti128).  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
}
#else
#define _mm256_inserti128_si256(X, Y, M)			 \
  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
					   (__v2di)(__m128i)(Y), \
					   (int)(M)))
#endif
   1091 
/* Conditionally load 32-bit elements from __X under mask __M
   (vpmaskmovd).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi32 (int const *__X, __m256i __M )
{
  return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
						(__v8si)__M);
}

/* Conditionally load 64-bit elements from __X under mask __M
   (vpmaskmovq).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi64 (long long const *__X, __m256i __M )
{
  return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
						(__v4di)__M);
}

/* 128-bit variant of _mm256_maskload_epi32 (vpmaskmovd).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi32 (int const *__X, __m128i __M )
{
  return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
					     (__v4si)__M);
}

/* 128-bit variant of _mm256_maskload_epi64 (vpmaskmovq).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi64 (long long const *__X, __m128i __M )
{
  return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
					     (__v2di)__M);
}

/* Conditionally store the 32-bit elements of __Y to __X under mask
   __M (vpmaskmovd).  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
{
  __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
}

/* Conditionally store the 64-bit elements of __Y to __X under mask
   __M (vpmaskmovq).  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
{
  __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
}

/* 128-bit variant of _mm256_maskstore_epi32 (vpmaskmovd).  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
{
  __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
}

/* 128-bit variant of _mm256_maskstore_epi64 (vpmaskmovq).  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
{
  __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
}
   1151 
/* Per-element logical left shift of __X by the counts in __Y
   (vpsllvd).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
}

/* 128-bit variant of _mm256_sllv_epi32 (vpsllvd).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
}

/* Per-element logical left shift of 64-bit elements (vpsllvq).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi64 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
}

/* 128-bit variant of _mm256_sllv_epi64 (vpsllvq).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
}

/* Per-element arithmetic right shift of __X by the counts in __Y
   (vpsravd).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srav_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
}

/* 128-bit variant of _mm256_srav_epi32 (vpsravd).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srav_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
}

/* Per-element logical right shift of __X by the counts in __Y
   (vpsrlvd).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
}

/* 128-bit variant of _mm256_srlv_epi32 (vpsrlvd).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
}

/* Per-element logical right shift of 64-bit elements (vpsrlvq).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi64 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
}

/* 128-bit variant of _mm256_srlv_epi64 (vpsrlvq).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
}
   1221 
   1222 #ifdef __OPTIMIZE__
/* Gather two doubles from BASE at the 32-bit indices in INDEX, each
   scaled by SCALE.  All elements are selected (all-ones mask).  */
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_pd (double const *base, __m128i index, const int scale)
{
  __v2df src = _mm_setzero_pd ();
  __v2df mask = _mm_cmpeq_pd (src, src);

  return (__m128d) __builtin_ia32_gathersiv2df (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

/* As _mm_i32gather_pd, but gather only elements selected by MASK;
   the rest are taken from SRC.  */
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index,
		       __m128d mask, const int scale)
{
  return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src,
						base,
						(__v4si)index,
						(__v2df)mask,
						scale);
}

/* Gather four doubles from BASE at the 32-bit indices in INDEX,
   scaled by SCALE; all elements selected.  */
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_pd (double const *base, __m128i index, const int scale)
{
  __v4df src = _mm256_setzero_pd ();
  __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);

  return (__m256d) __builtin_ia32_gathersiv4df (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

/* Masked form of _mm256_i32gather_pd; unselected elements come from
   SRC.  */
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_pd (__m256d src, double const *base,
			  __m128i index, __m256d mask, const int scale)
{
  return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src,
						base,
						(__v4si)index,
						(__v4df)mask,
						scale);
}

/* Gather two doubles from BASE at the 64-bit indices in INDEX,
   scaled by SCALE; all elements selected.  */
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_pd (double const *base, __m128i index, const int scale)
{
  __v2df src = _mm_setzero_pd ();
  __v2df mask = _mm_cmpeq_pd (src, src);

  return (__m128d) __builtin_ia32_gatherdiv2df (src,
						base,
						(__v2di)index,
						mask,
						scale);
}

/* Masked form of _mm_i64gather_pd; unselected elements come from
   SRC.  */
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index,
		       __m128d mask, const int scale)
{
  return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src,
						base,
						(__v2di)index,
						(__v2df)mask,
						scale);
}

/* Gather four doubles from BASE at the 64-bit indices in INDEX,
   scaled by SCALE; all elements selected.  */
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_pd (double const *base, __m256i index, const int scale)
{
  __v4df src = _mm256_setzero_pd ();
  __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);

  return (__m256d) __builtin_ia32_gatherdiv4df (src,
						base,
						(__v4di)index,
						mask,
						scale);
}

/* Masked form of _mm256_i64gather_pd; unselected elements come from
   SRC.  */
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_pd (__m256d src, double const *base,
			  __m256i index, __m256d mask, const int scale)
{
  return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src,
						base,
						(__v4di)index,
						(__v4df)mask,
						scale);
}
   1326 
/* Gather four floats from BASE at the 32-bit indices in INDEX, each
   scaled by SCALE.  All elements are selected (all-ones mask).  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_ps (float const *base, __m128i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);

  return (__m128) __builtin_ia32_gathersiv4sf (src,
					       base,
					       (__v4si)index,
					       mask,
					       scale);
}

/* Masked form of _mm_i32gather_ps; unselected elements come from
   SRC.  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index,
		       __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src,
					       base,
					       (__v4si)index,
					       (__v4sf)mask,
					       scale);
}

/* Gather eight floats from BASE at the 32-bit indices in INDEX,
   scaled by SCALE; all elements selected.  */
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_ps (float const *base, __m256i index, const int scale)
{
  __v8sf src = _mm256_setzero_ps ();
  __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);

  return (__m256) __builtin_ia32_gathersiv8sf (src,
					       base,
					       (__v8si)index,
					       mask,
					       scale);
}

/* Masked form of _mm256_i32gather_ps; unselected elements come from
   SRC.  */
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_ps (__m256 src, float const *base,
			  __m256i index, __m256 mask, const int scale)
{
  return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src,
					       base,
					       (__v8si)index,
					       (__v8sf)mask,
					       scale);
}

/* Gather four floats from BASE at the two 64-bit indices in INDEX,
   scaled by SCALE; all elements selected.  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_ps (float const *base, __m128i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);

  return (__m128) __builtin_ia32_gatherdiv4sf (src,
					       base,
					       (__v2di)index,
					       mask,
					       scale);
}

/* Masked form of _mm_i64gather_ps; unselected elements come from
   SRC.  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index,
		       __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src,
						base,
						(__v2di)index,
						(__v4sf)mask,
						scale);
}

/* Gather four floats from BASE at the four 64-bit indices in INDEX,
   scaled by SCALE; all elements selected.  The result is 128-bit
   even though the indices are 256-bit.  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_ps (float const *base, __m256i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);

  return (__m128) __builtin_ia32_gatherdiv4sf256 (src,
						  base,
						  (__v4di)index,
						  mask,
						  scale);
}

/* Masked form of _mm256_i64gather_ps; unselected elements come from
   SRC.  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_ps (__m128 src, float const *base,
			  __m256i index, __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src,
						  base,
						  (__v4di)index,
						  (__v4sf)mask,
						  scale);
}
   1430 
/* Gather two 64-bit integers from BASE at the 32-bit indices in
   INDEX, each scaled by SCALE.  All elements selected (all-ones
   mask).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi64 (long long int const *base,
		     __m128i index, const int scale)
{
  __v2di src = __extension__ (__v2di){ 0, 0 };
  __v2di mask = __extension__ (__v2di){ ~0, ~0 };

  return (__m128i) __builtin_ia32_gathersiv2di (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

/* Masked form of _mm_i32gather_epi64; unselected elements come from
   SRC.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi64 (__m128i src, long long int const *base,
			  __m128i index, __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src,
						base,
						(__v4si)index,
						(__v2di)mask,
						scale);
}

/* Gather four 64-bit integers from BASE at the 32-bit indices in
   INDEX, scaled by SCALE; all elements selected.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_epi64 (long long int const *base,
			__m128i index, const int scale)
{
  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gathersiv4di (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

/* Masked form of _mm256_i32gather_epi64; unselected elements come
   from SRC.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_epi64 (__m256i src, long long int const *base,
			     __m128i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src,
						base,
						(__v4si)index,
						(__v4di)mask,
						scale);
}

/* Gather two 64-bit integers from BASE at the 64-bit indices in
   INDEX, scaled by SCALE; all elements selected.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_epi64 (long long int const *base,
		     __m128i index, const int scale)
{
  __v2di src = __extension__ (__v2di){ 0, 0 };
  __v2di mask = __extension__ (__v2di){ ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv2di (src,
						base,
						(__v2di)index,
						mask,
						scale);
}

/* Masked form of _mm_i64gather_epi64; unselected elements come from
   SRC.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index,
			  __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src,
						base,
						(__v2di)index,
						(__v2di)mask,
						scale);
}

/* Gather four 64-bit integers from BASE at the 64-bit indices in
   INDEX, scaled by SCALE; all elements selected.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi64 (long long int const *base,
			__m256i index, const int scale)
{
  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gatherdiv4di (src,
						base,
						(__v4di)index,
						mask,
						scale);
}

/* Masked form of _mm256_i64gather_epi64; unselected elements come
   from SRC.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi64 (__m256i src, long long int const *base,
			     __m256i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src,
						base,
						(__v4di)index,
						(__v4di)mask,
						scale);
}
   1538 
/* Gather four 32-bit integers from BASE at the 32-bit indices in
   INDEX, each scaled by SCALE.  All elements selected (all-ones
   mask).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi32 (int const *base, __m128i index, const int scale)
{
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gathersiv4si (src,
					       base,
					       (__v4si)index,
					       mask,
					       scale);
}

/* Masked form of _mm_i32gather_epi32; unselected elements come from
   SRC.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index,
			  __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src,
						base,
						(__v4si)index,
						(__v4si)mask,
						scale);
}

/* Gather eight 32-bit integers from BASE at the 32-bit indices in
   INDEX, scaled by SCALE; all elements selected.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_epi32 (int const *base, __m256i index, const int scale)
{
  __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
  __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gathersiv8si (src,
						base,
						(__v8si)index,
						mask,
						scale);
}

/* Masked form of _mm256_i32gather_epi32; unselected elements come
   from SRC.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_epi32 (__m256i src, int const *base,
			     __m256i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src,
						base,
						(__v8si)index,
						(__v8si)mask,
						scale);
}

/* Gather four 32-bit integers from BASE at the two 64-bit indices in
   INDEX, scaled by SCALE; all elements selected.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_epi32 (int const *base, __m128i index, const int scale)
{
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv4si (src,
						base,
						(__v2di)index,
						mask,
						scale);
}

/* Masked form of _mm_i64gather_epi32; unselected elements come from
   SRC.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index,
			  __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src,
						base,
						(__v2di)index,
						(__v4si)mask,
						scale);
}

/* Gather four 32-bit integers from BASE at the four 64-bit indices
   in INDEX, scaled by SCALE; all elements selected.  The result is
   128-bit even though the indices are 256-bit.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi32 (int const *base, __m256i index, const int scale)
{
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv4si256 (src,
						  base,
						  (__v4di)index,
						  mask,
						  scale);
}

/* Masked form of _mm256_i64gather_epi32; unselected elements come
   from SRC.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi32 (__m128i src, int const *base,
			     __m256i index, __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src,
						   base,
						   (__v4di)index,
						   (__v4si)mask,
						   scale);
}
   1642 #else /* __OPTIMIZE__ */
/* Macro forms used without -O so SCALE reaches the builtin as a
   literal constant expression.  Each expands to the same builtin as
   the corresponding inline function above.  */
#define _mm_i32gather_pd(BASE, INDEX, SCALE)				\
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v4si)(__m128i)INDEX,	\
					 (__v2df)_mm_set1_pd(		\
					   (double)(long long int) -1), \
					 (int)SCALE)

/* Masked gather; unselected elements come from SRC.  */
#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC,	 \
					 (double const *)BASE,	 \
					 (__v4si)(__m128i)INDEX, \
					 (__v2df)(__m128d)MASK,	 \
					 (int)SCALE)

/* All-elements gather of four doubles via 32-bit indices.  */
#define _mm256_i32gather_pd(BASE, INDEX, SCALE)				\
  (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v4si)(__m128i)INDEX,	\
					 (__v4df)_mm256_set1_pd(	\
					   (double)(long long int) -1), \
					 (int)SCALE)

/* Masked gather; unselected elements come from SRC.  */
#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC,	 \
					 (double const *)BASE,	 \
					 (__v4si)(__m128i)INDEX, \
					 (__v4df)(__m256d)MASK,	 \
					 (int)SCALE)

/* All-elements gather of two doubles via 64-bit indices.  */
#define _mm_i64gather_pd(BASE, INDEX, SCALE)				\
  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v2di)(__m128i)INDEX,	\
					 (__v2df)_mm_set1_pd(		\
					   (double)(long long int) -1), \
					 (int)SCALE)

/* Masked gather; unselected elements come from SRC.  */
#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC,	 \
					 (double const *)BASE,	 \
					 (__v2di)(__m128i)INDEX, \
					 (__v2df)(__m128d)MASK,	 \
					 (int)SCALE)

/* All-elements gather of four doubles via 64-bit indices.  */
#define _mm256_i64gather_pd(BASE, INDEX, SCALE)				\
  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v4di)(__m256i)INDEX,	\
					 (__v4df)_mm256_set1_pd(	\
					   (double)(long long int) -1), \
					 (int)SCALE)

/* Masked gather; unselected elements come from SRC.  */
#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC,	 \
					 (double const *)BASE,	 \
					 (__v4di)(__m256i)INDEX, \
					 (__v4df)(__m256d)MASK,	 \
					 (int)SCALE)
   1702 
/* Gather 4 floats using 32-bit indices.  (float)(int) -1 == -1.0f has
   its sign bit set in every lane, so the mask selects all elements.  */
#define _mm_i32gather_ps(BASE, INDEX, SCALE)				\
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (),	\
					(float const *)BASE,		\
					(__v4si)(__m128i)INDEX,		\
					_mm_set1_ps ((float)(int) -1),	\
					(int)SCALE)
   1709 
   1710 #define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)	 \
   1711   (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128d)SRC,	 \
   1712 					(float const *)BASE,	 \
   1713 					(__v4si)(__m128i)INDEX,	 \
   1714 					(__v4sf)(__m128d)MASK,	 \
   1715 					(int)SCALE)
   1716 
/* Gather 8 floats using 32-bit indices; the -1.0f splat acts as an
   all-ones mask (sign bit set in every lane), selecting all elements.  */
#define _mm256_i32gather_ps(BASE, INDEX, SCALE)			       \
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
					(float const *)BASE,	       \
					(__v8si)(__m256i)INDEX,	       \
					(__v8sf)_mm256_set1_ps (       \
					  (float)(int) -1),	       \
					(int)SCALE)
   1724 
   1725 #define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
   1726   (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC,	\
   1727 					(float const *)BASE,	\
   1728 					(__v8si)(__m256i)INDEX, \
   1729 					(__v8sf)(__m256d)MASK,	\
   1730 					(int)SCALE)
   1731 
   1732 #define _mm_i64gather_ps(BASE, INDEX, SCALE)				\
   1733   (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_pd (),	\
   1734 					(float const *)BASE,		\
   1735 					(__v2di)(__m128i)INDEX,		\
   1736 					(__v4sf)_mm_set1_ps (		\
   1737 					  (float)(int) -1),		\
   1738 					(int)SCALE)
   1739 
   1740 #define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	 \
   1741   (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC,	 \
   1742 					(float const *)BASE,	 \
   1743 					(__v2di)(__m128i)INDEX,	 \
   1744 					(__v4sf)(__m128d)MASK,	 \
   1745 					(int)SCALE)
   1746 
/* Gather 4 floats using four 64-bit indices; the result is a 128-bit
   vector.  The -1.0f splat is an all-ones mask selecting all lanes.  */
#define _mm256_i64gather_ps(BASE, INDEX, SCALE)				\
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (),	\
					   (float const *)BASE,		\
					   (__v4di)(__m256i)INDEX,	\
					   (__v4sf)_mm_set1_ps(		\
					     (float)(int) -1),		\
					   (int)SCALE)

/* Masked gather of 4 floats with 64-bit indices: lanes whose MASK
   sign bit is set load from memory, the rest copy SRC.  */
#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	   \
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC,	   \
					   (float const *)BASE,	   \
					   (__v4di)(__m256i)INDEX, \
					   (__v4sf)(__m128)MASK,   \
					   (int)SCALE)
   1761 
/* Gather 2 qwords using the low two 32-bit indices; the all-ones
   mask (every sign bit set) selects all elements.  */
#define _mm_i32gather_epi64(BASE, INDEX, SCALE)				\
  (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
					 (long long const *)BASE,	\
					 (__v4si)(__m128i)INDEX,	\
					 (__v2di)_mm_set1_epi64x (-1),	\
					 (int)SCALE)

/* Masked gather of 2 qwords: lanes whose MASK sign bit is set load
   from memory, the rest copy SRC.  */
#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE)	  \
  (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC,	  \
					 (long long const *)BASE, \
					 (__v4si)(__m128i)INDEX,  \
					 (__v2di)(__m128i)MASK,	  \
					 (int)SCALE)

/* Gather 4 qwords using four 32-bit indices; all-ones mask gathers all.  */
#define _mm256_i32gather_epi64(BASE, INDEX, SCALE)			   \
  (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
					 (long long const *)BASE,	   \
					 (__v4si)(__m128i)INDEX,	   \
					 (__v4di)_mm256_set1_epi64x (-1),  \
					 (int)SCALE)

/* Masked gather of 4 qwords with 32-bit indices.  */
#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC,	   \
					 (long long const *)BASE,  \
					 (__v4si)(__m128i)INDEX,   \
					 (__v4di)(__m256i)MASK,	   \
					 (int)SCALE)

/* Gather 2 qwords using 64-bit indices; all-ones mask gathers all.  */
#define _mm_i64gather_epi64(BASE, INDEX, SCALE)				\
  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
					 (long long const *)BASE,	\
					 (__v2di)(__m128i)INDEX,	\
					 (__v2di)_mm_set1_epi64x (-1),	\
					 (int)SCALE)

/* Masked gather of 2 qwords with 64-bit indices.  */
#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE)	  \
  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC,	  \
					 (long long const *)BASE, \
					 (__v2di)(__m128i)INDEX,  \
					 (__v2di)(__m128i)MASK,	  \
					 (int)SCALE)

/* Gather 4 qwords using 64-bit indices; all-ones mask gathers all.  */
#define _mm256_i64gather_epi64(BASE, INDEX, SCALE)			   \
  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
					 (long long const *)BASE,	   \
					 (__v4di)(__m256i)INDEX,	   \
					 (__v4di)_mm256_set1_epi64x (-1),  \
					 (int)SCALE)

/* Masked gather of 4 qwords with 64-bit indices.  */
#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC,	   \
					 (long long const *)BASE,  \
					 (__v4di)(__m256i)INDEX,   \
					 (__v4di)(__m256i)MASK,	   \
					 (int)SCALE)
   1817 
/* Gather 4 dwords using 32-bit indices; the all-ones mask (every sign
   bit set) selects all elements.  */
#define _mm_i32gather_epi32(BASE, INDEX, SCALE)				\
  (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (),	\
					 (int const *)BASE,		\
					 (__v4si)(__m128i)INDEX,	\
					 (__v4si)_mm_set1_epi32 (-1),	\
					 (int)SCALE)

/* Masked gather of 4 dwords: lanes whose MASK sign bit is set load
   from memory, the rest copy SRC.  */
#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC,	\
					(int const *)BASE,	\
					(__v4si)(__m128i)INDEX, \
					(__v4si)(__m128i)MASK,	\
					(int)SCALE)

/* Gather 8 dwords using 32-bit indices; all-ones mask gathers all.  */
#define _mm256_i32gather_epi32(BASE, INDEX, SCALE)			   \
  (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
					 (int const *)BASE,		   \
					 (__v8si)(__m256i)INDEX,	   \
					 (__v8si)_mm256_set1_epi32 (-1),   \
					 (int)SCALE)

/* Masked gather of 8 dwords with 32-bit indices.  */
#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC,	   \
					(int const *)BASE,	   \
					(__v8si)(__m256i)INDEX,	   \
					(__v8si)(__m256i)MASK,	   \
					(int)SCALE)

/* Gather 2 dwords using 64-bit indices (upper result lanes zeroed);
   all-ones mask gathers all.  */
#define _mm_i64gather_epi32(BASE, INDEX, SCALE)				\
  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (),	\
					 (int const *)BASE,		\
					 (__v2di)(__m128i)INDEX,	\
					 (__v4si)_mm_set1_epi32 (-1),	\
					 (int)SCALE)

/* Masked gather of 2 dwords with 64-bit indices.  */
#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC,	\
					(int const *)BASE,	\
					(__v2di)(__m128i)INDEX, \
					(__v4si)(__m128i)MASK,	\
					(int)SCALE)

/* Gather 4 dwords using four 64-bit indices; result is 128-bit.  */
#define _mm256_i64gather_epi32(BASE, INDEX, SCALE)			   \
  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
					    (int const *)BASE,		   \
					    (__v4di)(__m256i)INDEX,	   \
					    (__v4si)_mm_set1_epi32(-1),	   \
					    (int)SCALE)

/* Masked gather of 4 dwords with 64-bit indices.  */
#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC,  \
					   (int const *)BASE,	   \
					   (__v4di)(__m256i)INDEX, \
					   (__v4si)(__m128i)MASK,  \
					   (int)SCALE)
   1873 #endif  /* __OPTIMIZE__ */
   1874