/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

typedef double __v4df __attribute__((__vector_size__(32)));
typedef float __v8sf __attribute__((__vector_size__(32)));
typedef long long __v4di __attribute__((__vector_size__(32)));
typedef int __v8si __attribute__((__vector_size__(32)));
typedef short __v16hi __attribute__((__vector_size__(32)));
typedef char __v32qi __attribute__((__vector_size__(32)));

typedef float __m256 __attribute__((__vector_size__(32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_add_pd(__m256d a, __m256d b)
{
  return a + b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_add_ps(__m256 a, __m256 b)
{
  return a + b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sub_pd(__m256d a, __m256d b)
{
  return a - b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sub_ps(__m256 a, __m256 b)
{
  return a - b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_div_pd(__m256d a, __m256d b)
{
  return a / b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_div_ps(__m256 a, __m256 b)
{
  return a / b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_max_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_max_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_min_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_min_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_mul_pd(__m256d a, __m256d b)
{
  return a * b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_mul_ps(__m256 a, __m256 b)
{
  return a * b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_pd(__m256d a)
{
  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rsqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rcp_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_round_pd(__m256d v, const int m)
{
  return (__m256d)__builtin_ia32_roundpd256((__v4df)v, m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_round_ps(__m256 v, const int m)
{
  return (__m256)__builtin_ia32_roundps256((__v8sf)v, m);
}

#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)

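/* Example: the arithmetic forms operate element-wise and the ceil/floor
 * macros are fixed-mode rounding.  An illustrative sketch (results assume
 * the default MXCSR rounding state):
 *
 *   __m256d x = _mm256_set1_pd(2.5);
 *   __m256d y = _mm256_set1_pd(0.5);
 *   __m256d q = _mm256_div_pd(x, y);   // { 5.0, 5.0, 5.0, 5.0 }
 *   __m256d f = _mm256_floor_pd(x);    // { 2.0, 2.0, 2.0, 2.0 }
 *   __m256d c = _mm256_ceil_pd(x);     // { 3.0, 3.0, 3.0, 3.0 }
 */
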
/* Logical */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_and_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_and_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_pd(__m256d a, __m256d b)
{
  return (__m256d)(~(__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_ps(__m256 a, __m256 b)
{
  return (__m256)(~(__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_or_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a | (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_or_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a | (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_xor_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a ^ (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_xor_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a ^ (__v8si)b);
}

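/* Example: the logical forms work on the raw bit patterns, so a sign-bit
 * mask gives branch-free absolute value and negation for some __m256d x:
 *
 *   __m256d sign = _mm256_set1_pd(-0.0);       // only the sign bits set
 *   __m256d absx = _mm256_andnot_pd(sign, x);  // ~sign & x  ==  |x|
 *   __m256d negx = _mm256_xor_pd(sign, x);     // flip sign bits  ==  -x
 */
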
/* Horizontal arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
}

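/* Example: the horizontal forms pair adjacent elements within each 128-bit
 * half, not across the whole vector:
 *
 *   __m256d a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
 *   __m256d b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
 *   __m256d h = _mm256_hadd_pd(a, b);   // { 3.0, 30.0, 7.0, 70.0 }
 */
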
/* Vector permutations */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_pd(__m128d a, __m128i c)
{
  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_pd(__m256d a, __m256i c)
{
  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_ps(__m128 a, __m128i c)
{
  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_ps(__m256 a, __m256i c)
{
  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a, (__v8si)c);
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permute_pd(__m128d a, const int c)
{
  return (__m128d)__builtin_ia32_vpermilpd((__v2df)a, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permute_pd(__m256d a, const int c)
{
  return (__m256d)__builtin_ia32_vpermilpd256((__v4df)a, c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permute_ps(__m128 a, const int c)
{
  return (__m128)__builtin_ia32_vpermilps((__v4sf)a, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permute_ps(__m256 a, const int c)
{
  return (__m256)__builtin_ia32_vpermilps256((__v8sf)a, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_pd(__m256d a, __m256d b, const int c)
{
  return (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)a, (__v4df)b, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)a, (__v8sf)b, c);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_si256(__m256i a, __m256i b, const int c)
{
  return (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)a, (__v8si)b, c);
}

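/* Example: for the 2f128 forms, each nibble of the immediate picks a
 * 128-bit half of the concatenation a:b (0 = a.lo, 1 = a.hi, 2 = b.lo,
 * 3 = b.hi); setting bit 3 of a nibble zeroes that half instead.  Given
 * some __m256d a and b:
 *
 *   __m256d lows = _mm256_permute2f128_pd(a, b, 0x20);  // { a.lo, b.lo }
 *   __m256d swap = _mm256_permute2f128_pd(a, a, 0x01);  // { a.hi, a.lo }
 */
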
/* Vector Blend */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blend_pd(__m256d a, __m256d b, const int c)
{
  return (__m256d)__builtin_ia32_blendpd256((__v4df)a, (__v4df)b, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blend_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_blendps256((__v8sf)a, (__v8sf)b, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
{
  return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
{
  return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
}

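/* Example: in the immediate forms, bit i of the control selects element i
 * from b (set) or a (clear); the blendv forms make the same per-element
 * choice from the sign bit of c:
 *
 *   __m256d a = _mm256_setr_pd(0.0, 1.0, 2.0, 3.0);
 *   __m256d b = _mm256_setr_pd(4.0, 5.0, 6.0, 7.0);
 *   __m256d r = _mm256_blend_pd(a, b, 0x5);   // { 4.0, 1.0, 6.0, 3.0 }
 */
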
/* Vector Dot Product */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_dp_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_dpps256((__v8sf)a, (__v8sf)b, c);
}

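/* Example: the high four immediate bits choose which elements of each
 * 128-bit lane enter the products and the low four bits choose which
 * result elements receive the sum (the rest are zeroed); the two lanes
 * are processed independently.  Given some __m256 a and b:
 *
 *   __m256 d = _mm256_dp_ps(a, b, 0xff);  // per-lane 4-element dot
 *                                         // product in every element
 */
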
/* Vector shuffle */
#define _mm256_shuffle_ps(a, b, mask) \
        (__builtin_shufflevector((__v8sf)(a), (__v8sf)(b), \
        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12))

#define _mm256_shuffle_pd(a, b, mask) \
        (__builtin_shufflevector((__v4df)(a), (__v4df)(b), \
        (mask) & 0x1, \
        (((mask) & 0x2) >> 1) + 4, \
        (((mask) & 0x4) >> 2) + 2, \
        (((mask) & 0x8) >> 3) + 6))

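/* Example: both shuffles apply their mask to each 128-bit lane separately.
 * Using _MM_SHUFFLE from <xmmintrin.h> to reverse within lanes:
 *
 *   __m256 a = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
 *   __m256 r = _mm256_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3));
 *   // r = { 3, 2, 1, 0, 7, 6, 5, 4 }
 */
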
/* Compare */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling) */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling) */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling) */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */

#define _mm_cmp_pd(a, b, c) \
  (__m128d)__builtin_ia32_cmppd((__v2df)(a), (__v2df)(b), (c))

#define _mm_cmp_ps(a, b, c) \
  (__m128)__builtin_ia32_cmpps((__v4sf)(a), (__v4sf)(b), (c))

#define _mm256_cmp_pd(a, b, c) \
  (__m256d)__builtin_ia32_cmppd256((__v4df)(a), (__v4df)(b), (c))

#define _mm256_cmp_ps(a, b, c) \
  (__m256)__builtin_ia32_cmpps256((__v8sf)(a), (__v8sf)(b), (c))

#define _mm_cmp_sd(a, b, c) \
  (__m128d)__builtin_ia32_cmpsd((__v2df)(a), (__v2df)(b), (c))

#define _mm_cmp_ss(a, b, c) \
  (__m128)__builtin_ia32_cmpss((__v4sf)(a), (__v4sf)(b), (c))

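/* Example: the comparisons yield all-ones/all-zero element masks, which
 * combine with blendv and movemask.  A per-element minimum for some
 * __m256d x and y:
 *
 *   __m256d m    = _mm256_cmp_pd(x, y, _CMP_LT_OS);  // lanes where x < y
 *   __m256d mn   = _mm256_blendv_pd(y, x, m);        // min(x, y)
 *   int     bits = _mm256_movemask_pd(m);            // one bit per lane
 */
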
/* Vector extract */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_pd(__m256d a, const int o)
{
  return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)a, o);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_ps(__m256 a, const int o)
{
  return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)a, o);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_si256(__m256i a, const int o)
{
  return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)a, o);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi32(__m256i a, const int imm)
{
  __v8si b = (__v8si)a;
  return b[imm & 7];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi16(__m256i a, const int imm)
{
  __v16hi b = (__v16hi)a;
  return b[imm & 15];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi8(__m256i a, const int imm)
{
  __v32qi b = (__v32qi)a;
  return b[imm & 31];
}

#ifdef __x86_64__
static __inline long long __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi64(__m256i a, const int imm)
{
  __v4di b = (__v4di)a;
  return b[imm & 3];
}
#endif

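/* Example: extracting the high 128 bits and folding them into the low
 * half is the usual first step of a horizontal reduction of some
 * __m256d v:
 *
 *   __m128d hi  = _mm256_extractf128_pd(v, 1);
 *   __m128d lo  = _mm256_castpd256_pd128(v);
 *   __m128d sum = _mm_add_pd(lo, hi);   // two partial sums remain
 */
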
/* Vector insert */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_pd(__m256d a, __m128d b, const int o)
{
  return (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)a, (__v2df)b, o);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_ps(__m256 a, __m128 b, const int o)
{
  return (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)a, (__v4sf)b, o);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_si256(__m256i a, __m128i b, const int o)
{
  return (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)a, (__v4si)b, o);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi32(__m256i a, int b, const int imm)
{
  __v8si c = (__v8si)a;
  c[imm & 7] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi16(__m256i a, int b, const int imm)
{
  __v16hi c = (__v16hi)a;
  c[imm & 15] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi8(__m256i a, int b, const int imm)
{
  __v32qi c = (__v32qi)a;
  c[imm & 31] = b;
  return (__m256i)c;
}

#ifdef __x86_64__
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi64(__m256i a, long long b, const int imm)
{
  __v4di c = (__v4di)a;
  c[imm & 3] = b;
  return (__m256i)c;
}
#endif

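/* Example: widening a 128-bit value and filling the upper half is a cast
 * followed by a 128-bit insert, for some __m128 lo and hi:
 *
 *   __m256 wide = _mm256_castps128_ps256(lo);         // upper half unset
 *   wide        = _mm256_insertf128_ps(wide, hi, 1);  // now { lo, hi }
 */
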
/* Conversion */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_pd(__m128i a)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_ps(__m256i a)
{
  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si)a);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_ps(__m256d a)
{
  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df)a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf)a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_pd(__m128 a)
{
  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf)a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df)a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df)a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf)a);
}

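/* Example: _mm256_cvtpd_epi32 rounds according to the current rounding
 * mode (round-to-nearest-even by default) while _mm256_cvttpd_epi32
 * always truncates toward zero:
 *
 *   __m256d v = _mm256_setr_pd(1.5, 2.5, -1.5, -2.5);
 *   __m128i r = _mm256_cvtpd_epi32(v);    // { 2, 2, -2, -2 }
 *   __m128i t = _mm256_cvttpd_epi32(v);   // { 1, 2, -1, -2 }
 */
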
/* Vector replicate */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_movehdup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_moveldup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_movedup_pd(__m256d a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2);
}

/* Unpack and Interleave */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
}

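/* Example: like the shuffles, the unpacks interleave within each 128-bit
 * lane rather than across the whole vector:
 *
 *   __m256 a  = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
 *   __m256 b  = _mm256_setr_ps(10, 11, 12, 13, 14, 15, 16, 17);
 *   __m256 lo = _mm256_unpacklo_ps(a, b);  // { 0, 10, 1, 11, 4, 14, 5, 15 }
 *   __m256 hi = _mm256_unpackhi_ps(a, b);  // { 2, 12, 3, 13, 6, 16, 7, 17 }
 */
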
/* Bit Test */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
}

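/* Example: testz returns 1 when (a & b) is all zero and testc returns 1
 * when (~a & b) is all zero; the si256 forms test every bit while the
 * pd/ps forms test only the element sign bits.  Given some __m256i v and
 * a comparison mask __m256d m:
 *
 *   int is_zero      = _mm256_testz_si256(v, v);  // 1 iff v == 0
 *   int any_negative = !_mm256_testz_pd(m, m);    // any sign bit set
 */
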
/* Vector extract sign mask */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_pd(__m256d a)
{
  return __builtin_ia32_movmskpd256((__v4df)a);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_ps(__m256 a)
{
  return __builtin_ia32_movmskps256((__v8sf)a);
}

/* Vector zero */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}

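/* Example: issuing _mm256_zeroupper before transferring control to code
 * compiled without AVX avoids the AVX/SSE transition penalty while
 * preserving the low 128 bits of each register; legacy_sse_routine here
 * stands for any hypothetical non-AVX callee:
 *
 *   _mm256_zeroupper();
 *   legacy_sse_routine();
 */
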
/* Vector load with broadcast */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_broadcast_ss(float const *a)
{
  return (__m128)__builtin_ia32_vbroadcastss(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_sd(double const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastsd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ss(float const *a)
{
  return (__m256)__builtin_ia32_vbroadcastss256(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_pd(__m128d const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ps(__m128 const *a)
{
  return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
}

/* SIMD load ops */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_load_pd(double const *p)
{
  return *(__m256d *)p;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_load_ps(float const *p)
{
  return *(__m256 *)p;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_pd(double const *p)
{
  return (__m256d)__builtin_ia32_loadupd256(p);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_ps(float const *p)
{
  return (__m256)__builtin_ia32_loadups256(p);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_load_si256(__m256i const *p)
{
  return *p;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_loaddqu256((char const *)p);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_lddqu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_lddqu256((char const *)p);
}

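/* Example: the load/store forms require a 32-byte-aligned address, while
 * loadu/storeu accept any alignment:
 *
 *   double buf[4] __attribute__((aligned(32))) = { 1.0, 2.0, 3.0, 4.0 };
 *   __m256d v = _mm256_load_pd(buf);    // aligned load
 *   __m256d u = _mm256_loadu_pd(buf);   // works for any address
 */
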
/* SIMD store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double *p, __m256d a)
{
  *(__m256d *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float *p, __m256 a)
{
  *(__m256 *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double *p, __m256d a)
{
  __builtin_ia32_storeupd256(p, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float *p, __m256 a)
{
  __builtin_ia32_storeups256(p, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i *p, __m256i a)
{
  *p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i *p, __m256i a)
{
  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
}

/* Conditional load ops */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const *p, __m128d m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const *p, __m256d m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const *p, __m128 m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const *p, __m256 m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
}

/* Conditional store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *p, __m128d m, __m128d a)
{
  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}

/* Cacheability support ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}

/* Create vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double a, double b, double c, double d)
{
  return (__m256d){ d, c, b, a };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float a, float b, float c, float d,
              float e, float f, float g, float h)
{
  return (__m256){ h, g, f, e, d, c, b, a };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int i0, int i1, int i2, int i3,
                 int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short w15, short w14, short w13, short w12,
                 short w11, short w10, short w09, short w08,
                 short w07, short w06, short w05, short w04,
                 short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
                             w08, w09, w10, w11, w12, w13, w14, w15 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char b31, char b30, char b29, char b28,
                char b27, char b26, char b25, char b24,
                char b23, char b22, char b21, char b20,
                char b19, char b18, char b17, char b16,
                char b15, char b14, char b13, char b12,
                char b11, char b10, char b09, char b08,
                char b07, char b06, char b05, char b04,
                char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b00, b01, b02, b03, b04, b05, b06, b07,
    b08, b09, b10, b11, b12, b13, b14, b15,
    b16, b17, b18, b19, b20, b21, b22, b23,
    b24, b25, b26, b27, b28, b29, b30, b31
  };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ d, c, b, a };
}

/* Create vectors with elements in reverse order */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double a, double b, double c, double d)
{
  return (__m256d){ a, b, c, d };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float a, float b, float c, float d,
               float e, float f, float g, float h)
{
  return (__m256){ a, b, c, d, e, f, g, h };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int i0, int i1, int i2, int i3,
                  int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short w15, short w14, short w13, short w12,
                  short w11, short w10, short w09, short w08,
                  short w07, short w06, short w05, short w04,
                  short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
                             w07, w06, w05, w04, w03, w02, w01, w00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char b31, char b30, char b29, char b28,
                 char b27, char b26, char b25, char b24,
                 char b23, char b22, char b21, char b20,
                 char b19, char b18, char b17, char b16,
                 char b15, char b14, char b13, char b12,
                 char b11, char b10, char b09, char b08,
                 char b07, char b06, char b05, char b04,
                 char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b31, b30, b29, b28, b27, b26, b25, b24,
    b23, b22, b21, b20, b19, b18, b17, b16,
    b15, b14, b13, b12, b11, b10, b09, b08,
    b07, b06, b05, b04, b03, b02, b01, b00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ a, b, c, d };
}

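/* Example: the set forms take arguments from the most-significant element
 * down, and the setr forms take them in memory order, so these two build
 * the same vector:
 *
 *   __m256d a = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);
 *   __m256d b = _mm256_setr_pd(0.0, 1.0, 2.0, 3.0);   // a == b per element
 */
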
/* Create vectors with repeated elements */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set1_pd(double w)
{
  return (__m256d){ w, w, w, w };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set1_ps(float w)
{
  return (__m256){ w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi32(int i)
{
  return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi16(short w)
{
  return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi8(char b)
{
  return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
                             b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi64x(long long q)
{
  return (__m256i)(__v4di){ q, q, q, q };
}

/* Create zeroed vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_pd(void)
{
  return (__m256d){ 0, 0, 0, 0 };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_ps(void)
{
  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_si256(void)
{
  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
}

/* Cast between vector types */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_ps(__m256d in)
{
  return (__m256)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_si256(__m256d in)
{
  return (__m256i)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castps_pd(__m256 in)
{
  return (__m256d)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castps_si256(__m256 in)
{
  return (__m256i)in;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_ps(__m256i in)
{
  return (__m256)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_pd(__m256i in)
{
  return (__m256d)in;
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  __m128d zero = _mm_setzero_pd();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  __m128 zero = _mm_setzero_ps();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  __m128i zero = _mm_setzero_si128();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}
   1139