      1 /*===---- avxintrin.h - AVX intrinsics -------------------------------------===
      2  *
      3  * Permission is hereby granted, free of charge, to any person obtaining a copy
      4  * of this software and associated documentation files (the "Software"), to deal
      5  * in the Software without restriction, including without limitation the rights
      6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      7  * copies of the Software, and to permit persons to whom the Software is
      8  * furnished to do so, subject to the following conditions:
      9  *
     10  * The above copyright notice and this permission notice shall be included in
     11  * all copies or substantial portions of the Software.
     12  *
     13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     19  * THE SOFTWARE.
     20  *
     21  *===-----------------------------------------------------------------------===
     22  */
     23 
     24 #ifndef __IMMINTRIN_H
     25 #error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
     26 #endif
     27 
     28 typedef double __v4df __attribute__ ((__vector_size__ (32)));
     29 typedef float __v8sf __attribute__ ((__vector_size__ (32)));
     30 typedef long long __v4di __attribute__ ((__vector_size__ (32)));
     31 typedef int __v8si __attribute__ ((__vector_size__ (32)));
     32 typedef short __v16hi __attribute__ ((__vector_size__ (32)));
     33 typedef char __v32qi __attribute__ ((__vector_size__ (32)));
     34 
     35 typedef float __m256 __attribute__ ((__vector_size__ (32)));
     36 typedef double __m256d __attribute__((__vector_size__(32)));
     37 typedef long long __m256i __attribute__((__vector_size__(32)));
     38 
     39 /* Arithmetic */
     40 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
     41 _mm256_add_pd(__m256d a, __m256d b)
     42 {
     43   return a+b;
     44 }
     45 
     46 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
     47 _mm256_add_ps(__m256 a, __m256 b)
     48 {
     49   return a+b;
     50 }
     51 
     52 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
     53 _mm256_sub_pd(__m256d a, __m256d b)
     54 {
     55   return a-b;
     56 }
     57 
     58 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
     59 _mm256_sub_ps(__m256 a, __m256 b)
     60 {
     61   return a-b;
     62 }
     63 
     64 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
     65 _mm256_addsub_pd(__m256d a, __m256d b)
     66 {
     67   return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
     68 }
     69 
     70 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
     71 _mm256_addsub_ps(__m256 a, __m256 b)
     72 {
     73   return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
     74 }
     75 
     76 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
     77 _mm256_div_pd(__m256d a, __m256d b)
     78 {
     79   return a / b;
     80 }
     81 
     82 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
     83 _mm256_div_ps(__m256 a, __m256 b)
     84 {
     85   return a / b;
     86 }
     87 
     88 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
     89 _mm256_max_pd(__m256d a, __m256d b)
     90 {
     91   return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
     92 }
     93 
     94 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
     95 _mm256_max_ps(__m256 a, __m256 b)
     96 {
     97   return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
     98 }
     99 
    100 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    101 _mm256_min_pd(__m256d a, __m256d b)
    102 {
    103   return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
    104 }
    105 
    106 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    107 _mm256_min_ps(__m256 a, __m256 b)
    108 {
    109   return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
    110 }
    111 
    112 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    113 _mm256_mul_pd(__m256d a, __m256d b)
    114 {
    115   return a * b;
    116 }
    117 
    118 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    119 _mm256_mul_ps(__m256 a, __m256 b)
    120 {
    121   return a * b;
    122 }
    123 
    124 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    125 _mm256_sqrt_pd(__m256d a)
    126 {
    127   return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
    128 }
    129 
    130 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    131 _mm256_sqrt_ps(__m256 a)
    132 {
    133   return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
    134 }
    135 
    136 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    137 _mm256_rsqrt_ps(__m256 a)
    138 {
    139   return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
    140 }
    141 
    142 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    143 _mm256_rcp_ps(__m256 a)
    144 {
    145   return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
    146 }
    147 
    148 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    149 _mm256_round_pd(__m256d v, const int m)
    150 {
    151   return (__m256d)__builtin_ia32_roundpd256((__v4df)v, m);
    152 }
    153 
    154 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    155 _mm256_round_ps(__m256 v, const int m)
    156 {
    157   return (__m256)__builtin_ia32_roundps256((__v8sf)v, m);
    158 }
    159 
    160 #define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
    161 #define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
    162 #define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
    163 #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
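
/* Illustrative usage sketch (not part of the original header): rounding a
 * 256-bit double vector with the convenience macros above.  Assumes the
 * translation unit includes <immintrin.h> and is built with AVX enabled.
 *
 *   __m256d v  = _mm256_set_pd(3.7, -1.2, 0.5, 2.9);
 *   __m256d up = _mm256_ceil_pd(v);    // per-element ceiling
 *   __m256d dn = _mm256_floor_pd(v);   // per-element floor
 *
 * _mm256_round_pd can also be called directly with an _MM_FROUND_* mode,
 * e.g. _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC.
 */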
    164 
    165 /* Logical */
    166 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    167 _mm256_and_pd(__m256d a, __m256d b)
    168 {
    169   return (__m256d)((__v4di)a & (__v4di)b);
    170 }
    171 
    172 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    173 _mm256_and_ps(__m256 a, __m256 b)
    174 {
    175   return (__m256)((__v8si)a & (__v8si)b);
    176 }
    177 
    178 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    179 _mm256_andnot_pd(__m256d a, __m256d b)
    180 {
    181   return (__m256d)(~(__v4di)a & (__v4di)b);
    182 }
    183 
    184 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    185 _mm256_andnot_ps(__m256 a, __m256 b)
    186 {
    187   return (__m256)(~(__v8si)a & (__v8si)b);
    188 }
    189 
    190 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    191 _mm256_or_pd(__m256d a, __m256d b)
    192 {
    193   return (__m256d)((__v4di)a | (__v4di)b);
    194 }
    195 
    196 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    197 _mm256_or_ps(__m256 a, __m256 b)
    198 {
    199   return (__m256)((__v8si)a | (__v8si)b);
    200 }
    201 
    202 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    203 _mm256_xor_pd(__m256d a, __m256d b)
    204 {
    205   return (__m256d)((__v4di)a ^ (__v4di)b);
    206 }
    207 
    208 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    209 _mm256_xor_ps(__m256 a, __m256 b)
    210 {
    211   return (__m256)((__v8si)a ^ (__v8si)b);
    212 }
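
/* Illustrative sketch (not part of the original header): the logical ops work
 * on raw bit patterns, so masking the sign bit gives a vectorized fabs().
 * The values below are assumptions chosen only for illustration.
 *
 *   __m256d x    = _mm256_set1_pd(-3.5);
 *   __m256d sign = _mm256_set1_pd(-0.0);         // only the sign bit set
 *   __m256d av   = _mm256_andnot_pd(sign, x);    // |x| in every element
 *   __m256d neg  = _mm256_or_pd(sign, x);        // -|x| in every element
 */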
    213 
    214 /* Horizontal arithmetic */
    215 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    216 _mm256_hadd_pd(__m256d a, __m256d b)
    217 {
    218   return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
    219 }
    220 
    221 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    222 _mm256_hadd_ps(__m256 a, __m256 b)
    223 {
    224   return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
    225 }
    226 
    227 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    228 _mm256_hsub_pd(__m256d a, __m256d b)
    229 {
    230   return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
    231 }
    232 
    233 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    234 _mm256_hsub_ps(__m256 a, __m256 b)
    235 {
    236   return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
    237 }
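
/* Illustrative note (not part of the original header): the horizontal ops
 * work within each 128-bit lane.  With doubles a = {a0,a1,a2,a3} and
 * b = {b0,b1,b2,b3} (lowest element first):
 *
 *   _mm256_hadd_pd(a, b) == { a0+a1, b0+b1, a2+a3, b2+b3 }
 */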
    238 
    239 /* Vector permutations */
    240 static __inline __m128d __attribute__((__always_inline__, __nodebug__))
    241 _mm_permutevar_pd(__m128d a, __m128i c)
    242 {
    243   return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
    244 }
    245 
    246 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    247 _mm256_permutevar_pd(__m256d a, __m256i c)
    248 {
    249   return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
    250 }
    251 
    252 static __inline __m128 __attribute__((__always_inline__, __nodebug__))
    253 _mm_permutevar_ps(__m128 a, __m128i c)
    254 {
    255   return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
    256 }
    257 
    258 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    259 _mm256_permutevar_ps(__m256 a, __m256i c)
    260 {
  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a, (__v8si)c);
    263 }
    264 
    265 static __inline __m128d __attribute__((__always_inline__, __nodebug__))
    266 _mm_permute_pd(__m128d a, const int c)
    267 {
    268   return (__m128d)__builtin_ia32_vpermilpd((__v2df)a, c);
    269 }
    270 
    271 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    272 _mm256_permute_pd(__m256d a, const int c)
    273 {
    274   return (__m256d)__builtin_ia32_vpermilpd256((__v4df)a, c);
    275 }
    276 
    277 static __inline __m128 __attribute__((__always_inline__, __nodebug__))
    278 _mm_permute_ps(__m128 a, const int c)
    279 {
    280   return (__m128)__builtin_ia32_vpermilps((__v4sf)a, c);
    281 }
    282 
    283 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    284 _mm256_permute_ps(__m256 a, const int c)
    285 {
    286   return (__m256)__builtin_ia32_vpermilps256((__v8sf)a, c);
    287 }
    288 
    289 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    290 _mm256_permute2f128_pd(__m256d a, __m256d b, const int c)
    291 {
    292   return (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)a, (__v4df)b, c);
    293 }
    294 
    295 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    296 _mm256_permute2f128_ps(__m256 a, __m256 b, const int c)
    297 {
    298   return (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)a, (__v8sf)b, c);
    299 }
    300 
    301 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
    302 _mm256_permute2f128_si256(__m256i a, __m256i b, const int c)
    303 {
    304   return (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)a, (__v8si)b, c);
    305 }
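
/* Illustrative sketch (not part of the original header): for the 2f128
 * permutes, immediate bits [1:0] select the result's low 128-bit half and
 * bits [5:4] select the high half (0..1 pick halves of a, 2..3 halves of b).
 * Swapping the two halves of a single vector:
 *
 *   __m256d a   = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);   // {1,2,3,4} low first
 *   __m256d rev = _mm256_permute2f128_pd(a, a, 0x01);  // {3,4,1,2} low first
 */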
    306 
    307 /* Vector Blend */
    308 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    309 _mm256_blend_pd(__m256d a, __m256d b, const int c)
    310 {
    311   return (__m256d)__builtin_ia32_blendpd256((__v4df)a, (__v4df)b, c);
    312 }
    313 
    314 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    315 _mm256_blend_ps(__m256 a, __m256 b, const int c)
    316 {
    317   return (__m256)__builtin_ia32_blendps256((__v8sf)a, (__v8sf)b, c);
    318 }
    319 
    320 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    321 _mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
    322 {
    323   return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
    324 }
    325 
    326 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    327 _mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
    328 {
    329   return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
    330 }
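
/* Illustrative sketch (not part of the original header): _mm256_blend_pd
 * selects per element with an immediate (bit i set picks b), while
 * _mm256_blendv_pd selects with a runtime mask (the sign bit of each element
 * of c picks b).  A hypothetical clamp of negative elements to zero, using
 * the compare intrinsics defined further below:
 *
 *   __m256d x    = _mm256_set_pd(1.0, -2.0, 3.0, -4.0);
 *   __m256d mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_LT_OQ);
 *   __m256d res  = _mm256_blendv_pd(x, _mm256_setzero_pd(), mask);
 */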
    331 
    332 /* Vector Dot Product */
    333 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    334 _mm256_dp_ps(__m256 a, __m256 b, const int c)
    335 {
    336   return (__m256)__builtin_ia32_dpps256((__v8sf)a, (__v8sf)b, c);
    337 }
    338 
    339 /* Vector shuffle */
#define _mm256_shuffle_ps(a, b, mask) \
        (__builtin_shufflevector((__v8sf)(a), (__v8sf)(b), \
        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12))
    346 
    347 #define _mm256_shuffle_pd(a, b, mask) \
    348         (__builtin_shufflevector((__v4df)(a), (__v4df)(b), \
    349         (mask) & 0x1, \
    350         (((mask) & 0x2) >> 1) + 4, \
    351         (((mask) & 0x4) >> 2) + 2, \
    352         (((mask) & 0x8) >> 3) + 6))
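
/* Illustrative sketch (not part of the original header): like their SSE
 * counterparts, these shuffles operate independently on each 128-bit lane,
 * and the usual _MM_SHUFFLE macro can build the selector.  Broadcasting
 * element 0 of each lane:
 *
 *   __m256 a = _mm256_set_ps(8, 7, 6, 5, 4, 3, 2, 1);   // {1..8} low first
 *   __m256 r = _mm256_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0));
 *   // r = { 1,1,1,1, 5,5,5,5 } in low-to-high element order
 */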
    353 
    354 /* Compare */
    355 #define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
    356 #define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
    357 #define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
    358 #define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
    359 #define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
    360 #define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
    361 #define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
    362 #define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling)   */
    363 #define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
    364 #define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
    365 #define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
    366 #define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
    367 #define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
    368 #define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
    369 #define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
    370 #define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
    371 #define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
    372 #define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
    373 #define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
    374 #define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
    375 #define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
    376 #define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
    377 #define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
    378 #define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
    379 #define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
    380 #define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
    381 #define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
    382 #define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
    383 #define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
    384 #define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
    385 #define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
    386 #define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
    387 
    388 static __inline __m128d __attribute__((__always_inline__, __nodebug__))
    389 _mm_cmp_pd(__m128d a, __m128d b, const int c)
    390 {
    391   return (__m128d)__builtin_ia32_cmppd((__v2df)a, (__v2df)b, c);
    392 }
    393 
    394 static __inline __m128 __attribute__((__always_inline__, __nodebug__))
    395 _mm_cmp_ps(__m128 a, __m128 b, const int c)
    396 {
    397   return (__m128)__builtin_ia32_cmpps((__v4sf)a, (__v4sf)b, c);
    398 }
    399 
    400 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    401 _mm256_cmp_pd(__m256d a, __m256d b, const int c)
    402 {
    403   return (__m256d)__builtin_ia32_cmppd256((__v4df)a, (__v4df)b, c);
    404 }
    405 
    406 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    407 _mm256_cmp_ps(__m256 a, __m256 b, const int c)
    408 {
    409   return (__m256)__builtin_ia32_cmpps256((__v8sf)a, (__v8sf)b, c);
    410 }
    411 
    412 static __inline __m128d __attribute__((__always_inline__, __nodebug__))
    413 _mm_cmp_sd(__m128d a, __m128d b, const int c)
    414 {
    415   return (__m128d)__builtin_ia32_cmpsd((__v2df)a, (__v2df)b, c);
    416 }
    417 
    418 static __inline __m128 __attribute__((__always_inline__, __nodebug__))
    419 _mm_cmp_ss(__m128 a, __m128 b, const int c)
    420 {
    421   return (__m128)__builtin_ia32_cmpss((__v4sf)a, (__v4sf)b, c);
    422 }
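
/* Illustrative sketch (not part of the original header): a compare yields an
 * all-ones or all-zero mask per element; combined with _mm256_movemask_ps
 * (defined further below) it reduces to a scalar, branchable result.
 *
 *   __m256 v     = _mm256_set1_ps(0.5f);
 *   __m256 limit = _mm256_set1_ps(1.0f);
 *   __m256 m     = _mm256_cmp_ps(v, limit, _CMP_LT_OQ);
 *   int all_lt   = _mm256_movemask_ps(m) == 0xff;  // 1 if every element < 1.0f
 */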
    423 
    424 /* Vector extract */
    425 static __inline __m128d __attribute__((__always_inline__, __nodebug__))
    426 _mm256_extractf128_pd(__m256d a, const int o)
    427 {
    428   return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)a, o);
    429 }
    430 
    431 static __inline __m128 __attribute__((__always_inline__, __nodebug__))
    432 _mm256_extractf128_ps(__m256 a, const int o)
    433 {
    434   return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)a, o);
    435 }
    436 
    437 static __inline __m128i __attribute__((__always_inline__, __nodebug__))
    438 _mm256_extractf128_si256(__m256i a, const int o)
    439 {
    440   return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)a, o);
    441 }
    442 
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi32(__m256i a, int const imm)
{
  __v8si b = (__v8si)a;
  return b[imm & 7];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi16(__m256i a, int const imm)
{
  __v16hi b = (__v16hi)a;
  return b[imm & 15];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi8(__m256i a, int const imm)
{
  __v32qi b = (__v32qi)a;
  return b[imm & 31];
}

#ifdef __x86_64__
static __inline long long __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi64(__m256i a, const int imm)
{
  __v4di b = (__v4di)a;
  return b[imm & 3];
}
#endif
    472 
    473 /* Vector insert */
    474 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    475 _mm256_insertf128_pd(__m256d a, __m128d b, const int o)
    476 {
    477   return (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)a, (__v2df)b, o);
    478 }
    479 
    480 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    481 _mm256_insertf128_ps(__m256 a, __m128 b, const int o)
    482 {
    483   return (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)a, (__v4sf)b, o);
    484 }
    485 
    486 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
    487 _mm256_insertf128_si256(__m256i a, __m128i b, const int o)
    488 {
    489   return (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)a, (__v4si)b, o);
    490 }
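
/* Illustrative sketch (not part of the original header): a common pattern is
 * to widen a 128-bit value (see the cast intrinsics at the end of this file)
 * and then insert a second 128-bit value into the upper half (index 1):
 *
 *   __m128 lo = _mm_set1_ps(1.0f);
 *   __m128 hi = _mm_set1_ps(2.0f);
 *   __m256 v  = _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
 */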
    491 
    492 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
    493 _mm256_insert_epi32(__m256i a, int b, int const imm)
    494 {
    495   __v8si c = (__v8si)a;
    496   c[imm & 7] = b;
    497   return (__m256i)c;
    498 }
    499 
    500 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
    501 _mm256_insert_epi16(__m256i a, int b, int const imm)
    502 {
    503   __v16hi c = (__v16hi)a;
    504   c[imm & 15] = b;
    505   return (__m256i)c;
    506 }
    507 
    508 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
    509 _mm256_insert_epi8(__m256i a, int b, int const imm)
    510 {
    511   __v32qi c = (__v32qi)a;
    512   c[imm & 31] = b;
    513   return (__m256i)c;
    514 }
    515 
    516 #ifdef __x86_64__
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi64(__m256i a, long long b, int const imm)
{
  __v4di c = (__v4di)a;
  c[imm & 3] = b;
  return (__m256i)c;
}
    524 #endif
    525 
    526 /* Conversion */
    527 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    528 _mm256_cvtepi32_pd(__m128i a)
    529 {
    530   return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
    531 }
    532 
    533 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    534 _mm256_cvtepi32_ps(__m256i a)
    535 {
    536   return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
    537 }
    538 
    539 static __inline __m128 __attribute__((__always_inline__, __nodebug__))
    540 _mm256_cvtpd_ps(__m256d a)
    541 {
    542   return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
    543 }
    544 
    545 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
    546 _mm256_cvtps_epi32(__m256 a)
    547 {
    548   return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
    549 }
    550 
    551 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    552 _mm256_cvtps_pd(__m128 a)
    553 {
    554   return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
    555 }
    556 
    557 static __inline __m128i __attribute__((__always_inline__, __nodebug__))
    558 _mm256_cvttpd_epi32(__m256d a)
    559 {
    560   return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
    561 }
    562 
    563 static __inline __m128i __attribute__((__always_inline__, __nodebug__))
    564 _mm256_cvtpd_epi32(__m256d a)
    565 {
    566   return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
    567 }
    568 
    569 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
    570 _mm256_cvttps_epi32(__m256 a)
    571 {
    572   return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
    573 }
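
/* Illustrative note (not part of the original header): _mm256_cvtps_epi32
 * rounds according to the current MXCSR rounding mode (round-to-nearest by
 * default), while _mm256_cvttps_epi32 truncates toward zero.
 *
 *   __m256  f = _mm256_set1_ps(2.7f);
 *   __m256i r = _mm256_cvtps_epi32(f);    // 3 in every element (default mode)
 *   __m256i t = _mm256_cvttps_epi32(f);   // 2 in every element
 */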
    574 
    575 /* Vector replicate */
    576 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    577 _mm256_movehdup_ps(__m256 a)
    578 {
    579   return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
    580 }
    581 
    582 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    583 _mm256_moveldup_ps(__m256 a)
    584 {
    585   return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
    586 }
    587 
    588 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    589 _mm256_movedup_pd(__m256d a)
    590 {
    591   return __builtin_shufflevector(a, a, 0, 0, 2, 2);
    592 }
    593 
    594 /* Unpack and Interleave */
    595 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    596 _mm256_unpackhi_pd(__m256d a, __m256d b)
    597 {
    598   return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
    599 }
    600 
    601 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    602 _mm256_unpacklo_pd(__m256d a, __m256d b)
    603 {
    604   return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
    605 }
    606 
    607 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    608 _mm256_unpackhi_ps(__m256 a, __m256 b)
    609 {
    610   return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
    611 }
    612 
    613 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    614 _mm256_unpacklo_ps(__m256 a, __m256 b)
    615 {
    616   return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
    617 }
    618 
    619 /* Bit Test */
    620 static __inline int __attribute__((__always_inline__, __nodebug__))
    621 _mm_testz_pd(__m128d a, __m128d b)
    622 {
    623   return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
    624 }
    625 
    626 static __inline int __attribute__((__always_inline__, __nodebug__))
    627 _mm_testc_pd(__m128d a, __m128d b)
    628 {
    629   return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
    630 }
    631 
    632 static __inline int __attribute__((__always_inline__, __nodebug__))
    633 _mm_testnzc_pd(__m128d a, __m128d b)
    634 {
    635   return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
    636 }
    637 
    638 static __inline int __attribute__((__always_inline__, __nodebug__))
    639 _mm_testz_ps(__m128 a, __m128 b)
    640 {
    641   return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
    642 }
    643 
    644 static __inline int __attribute__((__always_inline__, __nodebug__))
    645 _mm_testc_ps(__m128 a, __m128 b)
    646 {
    647   return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
    648 }
    649 
    650 static __inline int __attribute__((__always_inline__, __nodebug__))
    651 _mm_testnzc_ps(__m128 a, __m128 b)
    652 {
    653   return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
    654 }
    655 
    656 static __inline int __attribute__((__always_inline__, __nodebug__))
    657 _mm256_testz_pd(__m256d a, __m256d b)
    658 {
    659   return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
    660 }
    661 
    662 static __inline int __attribute__((__always_inline__, __nodebug__))
    663 _mm256_testc_pd(__m256d a, __m256d b)
    664 {
    665   return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
    666 }
    667 
    668 static __inline int __attribute__((__always_inline__, __nodebug__))
    669 _mm256_testnzc_pd(__m256d a, __m256d b)
    670 {
    671   return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
    672 }
    673 
    674 static __inline int __attribute__((__always_inline__, __nodebug__))
    675 _mm256_testz_ps(__m256 a, __m256 b)
    676 {
    677   return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
    678 }
    679 
    680 static __inline int __attribute__((__always_inline__, __nodebug__))
    681 _mm256_testc_ps(__m256 a, __m256 b)
    682 {
    683   return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
    684 }
    685 
    686 static __inline int __attribute__((__always_inline__, __nodebug__))
    687 _mm256_testnzc_ps(__m256 a, __m256 b)
    688 {
    689   return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
    690 }
    691 
    692 static __inline int __attribute__((__always_inline__, __nodebug__))
    693 _mm256_testz_si256(__m256i a, __m256i b)
    694 {
    695   return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
    696 }
    697 
    698 static __inline int __attribute__((__always_inline__, __nodebug__))
    699 _mm256_testc_si256(__m256i a, __m256i b)
    700 {
    701   return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
    702 }
    703 
    704 static __inline int __attribute__((__always_inline__, __nodebug__))
    705 _mm256_testnzc_si256(__m256i a, __m256i b)
    706 {
    707   return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
    708 }
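
/* Illustrative sketch (not part of the original header): testz with both
 * operands equal checks whether a 256-bit integer vector is entirely zero.
 *
 *   __m256i v       = _mm256_setzero_si256();
 *   int     is_zero = _mm256_testz_si256(v, v);   // 1 iff all 256 bits are 0
 */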
    709 
    710 /* Vector extract sign mask */
    711 static __inline int __attribute__((__always_inline__, __nodebug__))
    712 _mm256_movemask_pd(__m256d a)
    713 {
    714   return __builtin_ia32_movmskpd256((__v4df)a);
    715 }
    716 
    717 static __inline int __attribute__((__always_inline__, __nodebug__))
    718 _mm256_movemask_ps(__m256 a)
    719 {
    720   return __builtin_ia32_movmskps256((__v8sf)a);
    721 }
    722 
    723 /* Vector zero */
    724 static __inline void __attribute__((__always_inline__, __nodebug__))
    725 _mm256_zeroall(void)
    726 {
    727   __builtin_ia32_vzeroall();
    728 }
    729 
    730 static __inline void __attribute__((__always_inline__, __nodebug__))
    731 _mm256_zeroupper(void)
    732 {
    733   __builtin_ia32_vzeroupper();
    734 }
    735 
    736 /* Vector load with broadcast */
    737 static __inline __m128 __attribute__((__always_inline__, __nodebug__))
    738 _mm_broadcast_ss(float const *a)
    739 {
    740   return (__m128)__builtin_ia32_vbroadcastss(a);
    741 }
    742 
    743 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    744 _mm256_broadcast_sd(double const *a)
    745 {
    746   return (__m256d)__builtin_ia32_vbroadcastsd256(a);
    747 }
    748 
    749 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    750 _mm256_broadcast_ss(float const *a)
    751 {
    752   return (__m256)__builtin_ia32_vbroadcastss256(a);
    753 }
    754 
    755 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    756 _mm256_broadcast_pd(__m128d const *a)
    757 {
    758   return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
    759 }
    760 
    761 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    762 _mm256_broadcast_ps(__m128 const *a)
    763 {
    764   return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
    765 }
    766 
    767 /* SIMD load ops */
    768 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    769 _mm256_load_pd(double const *p)
    770 {
    771   return *(__m256d *)p;
    772 }
    773 
    774 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    775 _mm256_load_ps(float const *p)
    776 {
    777   return *(__m256 *)p;
    778 }
    779 
    780 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    781 _mm256_loadu_pd(double const *p)
    782 {
    783   return (__m256d)__builtin_ia32_loadupd256(p);
    784 }
    785 
    786 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    787 _mm256_loadu_ps(float const *p)
    788 {
    789   return (__m256)__builtin_ia32_loadups256(p);
    790 }
    791 
    792 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
    793 _mm256_load_si256(__m256i const *p)
    794 {
    795   return *p;
    796 }
    797 
    798 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
    799 _mm256_loadu_si256(__m256i const *p)
    800 {
    801   return (__m256i)__builtin_ia32_loaddqu256((char const *)p);
    802 }
    803 
    804 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
    805 _mm256_lddqu_si256(__m256i const *p)
    806 {
    807   return (__m256i)__builtin_ia32_lddqu256((char const *)p);
    808 }
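
/* Illustrative note (not part of the original header): _mm256_load_pd/ps and
 * _mm256_load_si256 expect 32-byte-aligned pointers, while the *_loadu_* and
 * _mm256_lddqu_si256 forms accept unaligned addresses.
 *
 *   float buf[8] __attribute__((aligned(32))) = { 0 };
 *   __m256 v = _mm256_load_ps(buf);        // aligned form is safe here
 */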
    809 
    810 /* SIMD store ops */
    811 static __inline void __attribute__((__always_inline__, __nodebug__))
    812 _mm256_store_pd(double *p, __m256d a)
    813 {
    814   *(__m256d *)p = a;
    815 }
    816 
    817 static __inline void __attribute__((__always_inline__, __nodebug__))
    818 _mm256_store_ps(float *p, __m256 a)
    819 {
    820   *(__m256 *)p = a;
    821 }
    822 
    823 static __inline void __attribute__((__always_inline__, __nodebug__))
    824 _mm256_storeu_pd(double *p, __m256d a)
    825 {
    826   __builtin_ia32_storeupd256(p, (__v4df)a);
    827 }
    828 
    829 static __inline void __attribute__((__always_inline__, __nodebug__))
    830 _mm256_storeu_ps(float *p, __m256 a)
    831 {
    832   __builtin_ia32_storeups256(p, (__v8sf)a);
    833 }
    834 
    835 static __inline void __attribute__((__always_inline__, __nodebug__))
    836 _mm256_store_si256(__m256i *p, __m256i a)
    837 {
    838   *p = a;
    839 }
    840 
    841 static __inline void __attribute__((__always_inline__, __nodebug__))
    842 _mm256_storeu_si256(__m256i *p, __m256i a)
    843 {
    844   __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
    845 }
    846 
    847 /* Conditional load ops */
    848 static __inline __m128d __attribute__((__always_inline__, __nodebug__))
    849 _mm_maskload_pd(double const *p, __m128d m)
    850 {
    851   return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
    852 }
    853 
    854 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    855 _mm256_maskload_pd(double const *p, __m256d m)
    856 {
    857   return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
    858 }
    859 
    860 static __inline __m128 __attribute__((__always_inline__, __nodebug__))
    861 _mm_maskload_ps(float const *p, __m128 m)
    862 {
    863   return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
    864 }
    865 
    866 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    867 _mm256_maskload_ps(float const *p, __m256 m)
    868 {
    869   return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
    870 }
    871 
    872 /* Conditional store ops */
    873 static __inline void __attribute__((__always_inline__, __nodebug__))
    874 _mm256_maskstore_ps(float *p, __m256 m, __m256 a)
    875 {
    876   __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
    877 }
    878 
    879 static __inline void __attribute__((__always_inline__, __nodebug__))
    880 _mm_maskstore_pd(double *p, __m128d m, __m128d a)
    881 {
    882   __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
    883 }
    884 
    885 static __inline void __attribute__((__always_inline__, __nodebug__))
    886 _mm256_maskstore_pd(double *p, __m256d m, __m256d a)
    887 {
    888   __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
    889 }
    890 
    891 static __inline void __attribute__((__always_inline__, __nodebug__))
    892 _mm_maskstore_ps(float *p, __m128 m, __m128 a)
    893 {
    894   __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
    895 }
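
/* Illustrative sketch (not part of the original header): mask loads/stores
 * touch only the elements whose mask sign bit is set, which is handy for an
 * array tail shorter than a full vector.  The three-element tail below is an
 * assumption chosen for illustration.
 *
 *   double tail[3] = { 1.0, 2.0, 3.0 };
 *   __m256i m = _mm256_setr_epi64x(-1, -1, -1, 0);      // last element off
 *   __m256d v = _mm256_maskload_pd(tail, _mm256_castsi256_pd(m));
 *   _mm256_maskstore_pd(tail, _mm256_castsi256_pd(m), v);
 */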
    896 
    897 /* Cacheability support ops */
    898 static __inline void __attribute__((__always_inline__, __nodebug__))
    899 _mm256_stream_si256(__m256i *a, __m256i b)
    900 {
    901   __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
    902 }
    903 
    904 static __inline void __attribute__((__always_inline__, __nodebug__))
    905 _mm256_stream_pd(double *a, __m256d b)
    906 {
    907   __builtin_ia32_movntpd256(a, (__v4df)b);
    908 }
    909 
    910 static __inline void __attribute__((__always_inline__, __nodebug__))
    911 _mm256_stream_ps(float *p, __m256 a)
    912 {
    913   __builtin_ia32_movntps256(p, (__v8sf)a);
    914 }
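
/* Illustrative note (not part of the original header): the streaming stores
 * above require 32-byte-aligned destinations and bypass the cache; an
 * _mm_sfence() is the usual way to order them before the data is read by
 * another agent.  dst and v are assumed to be declared elsewhere.
 *
 *   _mm256_stream_ps(dst, v);   // dst assumed 32-byte aligned
 *   _mm_sfence();
 */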
    915 
    916 /* Create vectors */
    917 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    918 _mm256_set_pd(double a, double b, double c, double d)
    919 {
    920   return (__m256d){ d, c, b, a };
    921 }
    922 
    923 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    924 _mm256_set_ps(float a, float b, float c, float d,
    925 	            float e, float f, float g, float h)
    926 {
    927   return (__m256){ h, g, f, e, d, c, b, a };
    928 }
    929 
    930 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
    931 _mm256_set_epi32(int i0, int i1, int i2, int i3,
    932 		             int i4, int i5, int i6, int i7)
    933 {
    934   return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
    935 }
    936 
    937 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
    938 _mm256_set_epi16(short w15, short w14, short w13, short w12,
    939 		             short w11, short w10, short w09, short w08,
    940 		             short w07, short w06, short w05, short w04,
    941 		             short w03, short w02, short w01, short w00)
    942 {
    943   return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
    944                              w08, w09, w10, w11, w12, w13, w14, w15 };
    945 }
    946 
    947 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
    948 _mm256_set_epi8(char b31, char b30, char b29, char b28,
    949 		            char b27, char b26, char b25, char b24,
    950 		            char b23, char b22, char b21, char b20,
    951 		            char b19, char b18, char b17, char b16,
    952 		            char b15, char b14, char b13, char b12,
    953 		            char b11, char b10, char b09, char b08,
    954 		            char b07, char b06, char b05, char b04,
    955 		            char b03, char b02, char b01, char b00)
    956 {
    957   return (__m256i)(__v32qi){
    958     b00, b01, b02, b03, b04, b05, b06, b07,
    959     b08, b09, b10, b11, b12, b13, b14, b15,
    960     b16, b17, b18, b19, b20, b21, b22, b23,
    961     b24, b25, b26, b27, b28, b29, b30, b31
    962   };
    963 }
    964 
    965 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
    966 _mm256_set_epi64x(long long a, long long b, long long c, long long d)
    967 {
    968   return (__m256i)(__v4di){ d, c, b, a };
    969 }
    970 
    971 /* Create vectors with elements in reverse order */
    972 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
    973 _mm256_setr_pd(double a, double b, double c, double d)
    974 {
    975   return (__m256d){ a, b, c, d };
    976 }
    977 
    978 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
    979 _mm256_setr_ps(float a, float b, float c, float d,
    980 		           float e, float f, float g, float h)
    981 {
    982   return (__m256){ a, b, c, d, e, f, g, h };
    983 }
    984 
    985 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
    986 _mm256_setr_epi32(int i0, int i1, int i2, int i3,
    987 		              int i4, int i5, int i6, int i7)
    988 {
    989   return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
    990 }
    991 
    992 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
    993 _mm256_setr_epi16(short w15, short w14, short w13, short w12,
    994 		   short w11, short w10, short w09, short w08,
    995 		   short w07, short w06, short w05, short w04,
    996 		   short w03, short w02, short w01, short w00)
    997 {
    998   return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
    999 			                       w07, w06, w05, w04, w03, w02, w01, w00 };
   1000 }
   1001 
   1002 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
   1003 _mm256_setr_epi8(char b31, char b30, char b29, char b28,
   1004 		             char b27, char b26, char b25, char b24,
   1005 		             char b23, char b22, char b21, char b20,
   1006 		             char b19, char b18, char b17, char b16,
   1007 		             char b15, char b14, char b13, char b12,
   1008 		             char b11, char b10, char b09, char b08,
   1009 		             char b07, char b06, char b05, char b04,
   1010 		             char b03, char b02, char b01, char b00)
   1011 {
   1012   return (__m256i)(__v32qi){
   1013     b31, b30, b29, b28, b27, b26, b25, b24,
   1014 		b23, b22, b21, b20, b19, b18, b17, b16,
   1015 		b15, b14, b13, b12, b11, b10, b09, b08,
   1016 		b07, b06, b05, b04, b03, b02, b01, b00 };
   1017 }
   1018 
   1019 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
   1020 _mm256_setr_epi64x(long long a, long long b, long long c, long long d)
   1021 {
   1022   return (__m256i)(__v4di){ a, b, c, d };
   1023 }
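
/* Illustrative note (not part of the original header): _mm256_set_* take
 * arguments from the highest element down, while _mm256_setr_* take them in
 * memory order, so these two calls build the same vector:
 *
 *   __m256i a = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
 *   __m256i b = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
 */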
   1024 
   1025 /* Create vectors with repeated elements */
   1026 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
   1027 _mm256_set1_pd(double w)
   1028 {
   1029   return (__m256d){ w, w, w, w };
   1030 }
   1031 
   1032 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
   1033 _mm256_set1_ps(float w)
   1034 {
   1035   return (__m256){ w, w, w, w, w, w, w, w };
   1036 }
   1037 
   1038 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
   1039 _mm256_set1_epi32(int i)
   1040 {
   1041   return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
   1042 }
   1043 
   1044 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
   1045 _mm256_set1_epi16(short w)
   1046 {
   1047   return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
   1048 }
   1049 
   1050 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
   1051 _mm256_set1_epi8(char b)
   1052 {
   1053   return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
   1054                              b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
   1055 }
   1056 
   1057 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
   1058 _mm256_set1_epi64x(long long q)
   1059 {
   1060   return (__m256i)(__v4di){ q, q, q, q };
   1061 }
   1062 
   1063 /* Create zeroed vectors */
   1064 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
   1065 _mm256_setzero_pd(void)
   1066 {
   1067   return (__m256d){ 0, 0, 0, 0 };
   1068 }
   1069 
   1070 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
   1071 _mm256_setzero_ps(void)
   1072 {
   1073   return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
   1074 }
   1075 
   1076 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
   1077 _mm256_setzero_si256(void)
   1078 {
   1079   return (__m256i){ 0LL, 0LL, 0LL, 0LL };
   1080 }
   1081 
   1082 /* Cast between vector types */
   1083 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
   1084 _mm256_castpd_ps(__m256d in)
   1085 {
   1086   return (__m256)in;
   1087 }
   1088 
   1089 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
   1090 _mm256_castpd_si256(__m256d in)
   1091 {
   1092   return (__m256i)in;
   1093 }
   1094 
   1095 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
   1096 _mm256_castps_pd(__m256 in)
   1097 {
   1098   return (__m256d)in;
   1099 }
   1100 
   1101 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
   1102 _mm256_castps_si256(__m256 in)
   1103 {
   1104   return (__m256i)in;
   1105 }
   1106 
   1107 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
   1108 _mm256_castsi256_ps(__m256i in)
   1109 {
   1110   return (__m256)in;
   1111 }
   1112 
   1113 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
   1114 _mm256_castsi256_pd(__m256i in)
   1115 {
   1116   return (__m256d)in;
   1117 }
   1118 
   1119 static __inline __m128d __attribute__((__always_inline__, __nodebug__))
   1120 _mm256_castpd256_pd128(__m256d in)
   1121 {
   1122   return __builtin_shufflevector(in, in, 0, 1);
   1123 }
   1124 
   1125 static __inline __m128 __attribute__((__always_inline__, __nodebug__))
   1126 _mm256_castps256_ps128(__m256 in)
   1127 {
   1128   return __builtin_shufflevector(in, in, 0, 1, 2, 3);
   1129 }
   1130 
   1131 static __inline __m128i __attribute__((__always_inline__, __nodebug__))
   1132 _mm256_castsi256_si128(__m256i in)
   1133 {
   1134   return __builtin_shufflevector(in, in, 0, 1);
   1135 }
   1136 
   1137 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
   1138 _mm256_castpd128_pd256(__m128d in)
   1139 {
   1140   __m128d zero = _mm_setzero_pd();
   1141   return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
   1142 }
   1143 
   1144 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
   1145 _mm256_castps128_ps256(__m128 in)
   1146 {
   1147   __m128 zero = _mm_setzero_ps();
   1148   return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
   1149 }
   1150 
   1151 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
   1152 _mm256_castsi128_si256(__m128i in)
   1153 {
   1154   __m128i zero = _mm_setzero_si128();
   1155   return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}
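
/* Illustrative note (not part of the original header): the 128-to-256 casts
 * above are implemented here by shuffling in a zero vector, so the upper half
 * of the result is zeroed; the 256-to-128 casts simply return the low half.
 *
 *   __m128d lo   = _mm_set_pd(2.0, 1.0);                // { 1.0, 2.0 }
 *   __m256d wide = _mm256_castpd128_pd256(lo);          // { 1.0, 2.0, 0.0, 0.0 }
 */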
   1157