Home | History | Annotate | Download | only in Headers
      1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
      2  *
      3  * Permission is hereby granted, free of charge, to any person obtaining a copy
      4  * of this software and associated documentation files (the "Software"), to deal
      5  * in the Software without restriction, including without limitation the rights
      6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      7  * copies of the Software, and to permit persons to whom the Software is
      8  * furnished to do so, subject to the following conditions:
      9  *
     10  * The above copyright notice and this permission notice shall be included in
     11  * all copies or substantial portions of the Software.
     12  *
     13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     19  * THE SOFTWARE.
     20  *
     21  *===-----------------------------------------------------------------------===
     22  */
     23 
     24 #ifndef __EMMINTRIN_H
     25 #define __EMMINTRIN_H
     26 
     27 #include <xmmintrin.h>
     28 
     29 typedef double __m128d __attribute__((__vector_size__(16)));
     30 typedef long long __m128i __attribute__((__vector_size__(16)));
     31 
     32 /* Type defines.  */
     33 typedef double __v2df __attribute__ ((__vector_size__ (16)));
     34 typedef long long __v2di __attribute__ ((__vector_size__ (16)));
     35 typedef short __v8hi __attribute__((__vector_size__(16)));
     36 typedef char __v16qi __attribute__((__vector_size__(16)));
     37 
     38 /* Unsigned types */
     39 typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
     40 typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
     41 typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
     42 
     43 /* We need an explicitly signed variant for char. Note that this shouldn't
     44  * appear in the interface though. */
     45 typedef signed char __v16qs __attribute__((__vector_size__(16)));
     46 
     47 #include <f16cintrin.h>
     48 
     49 /* Define the default attributes for the functions in this file. */
     50 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
     51 
     52 static __inline__ __m128d __DEFAULT_FN_ATTRS
     53 _mm_add_sd(__m128d __a, __m128d __b)
     54 {
     55   __a[0] += __b[0];
     56   return __a;
     57 }
     58 
     59 static __inline__ __m128d __DEFAULT_FN_ATTRS
     60 _mm_add_pd(__m128d __a, __m128d __b)
     61 {
     62   return (__m128d)((__v2df)__a + (__v2df)__b);
     63 }
     64 
     65 static __inline__ __m128d __DEFAULT_FN_ATTRS
     66 _mm_sub_sd(__m128d __a, __m128d __b)
     67 {
     68   __a[0] -= __b[0];
     69   return __a;
     70 }
     71 
     72 static __inline__ __m128d __DEFAULT_FN_ATTRS
     73 _mm_sub_pd(__m128d __a, __m128d __b)
     74 {
     75   return (__m128d)((__v2df)__a - (__v2df)__b);
     76 }
     77 
     78 static __inline__ __m128d __DEFAULT_FN_ATTRS
     79 _mm_mul_sd(__m128d __a, __m128d __b)
     80 {
     81   __a[0] *= __b[0];
     82   return __a;
     83 }
     84 
     85 static __inline__ __m128d __DEFAULT_FN_ATTRS
     86 _mm_mul_pd(__m128d __a, __m128d __b)
     87 {
     88   return (__m128d)((__v2df)__a * (__v2df)__b);
     89 }
     90 
     91 static __inline__ __m128d __DEFAULT_FN_ATTRS
     92 _mm_div_sd(__m128d __a, __m128d __b)
     93 {
     94   __a[0] /= __b[0];
     95   return __a;
     96 }
     97 
     98 static __inline__ __m128d __DEFAULT_FN_ATTRS
     99 _mm_div_pd(__m128d __a, __m128d __b)
    100 {
    101   return (__m128d)((__v2df)__a / (__v2df)__b);
    102 }
    103 
    104 static __inline__ __m128d __DEFAULT_FN_ATTRS
    105 _mm_sqrt_sd(__m128d __a, __m128d __b)
    106 {
    107   __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
    108   return (__m128d) { __c[0], __a[1] };
    109 }
    110 
    111 static __inline__ __m128d __DEFAULT_FN_ATTRS
    112 _mm_sqrt_pd(__m128d __a)
    113 {
    114   return __builtin_ia32_sqrtpd((__v2df)__a);
    115 }
    116 
    117 static __inline__ __m128d __DEFAULT_FN_ATTRS
    118 _mm_min_sd(__m128d __a, __m128d __b)
    119 {
    120   return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
    121 }
    122 
    123 static __inline__ __m128d __DEFAULT_FN_ATTRS
    124 _mm_min_pd(__m128d __a, __m128d __b)
    125 {
    126   return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
    127 }
    128 
    129 static __inline__ __m128d __DEFAULT_FN_ATTRS
    130 _mm_max_sd(__m128d __a, __m128d __b)
    131 {
    132   return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
    133 }
    134 
    135 static __inline__ __m128d __DEFAULT_FN_ATTRS
    136 _mm_max_pd(__m128d __a, __m128d __b)
    137 {
    138   return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
    139 }
    140 
    141 static __inline__ __m128d __DEFAULT_FN_ATTRS
    142 _mm_and_pd(__m128d __a, __m128d __b)
    143 {
    144   return (__m128d)((__v4su)__a & (__v4su)__b);
    145 }
    146 
    147 static __inline__ __m128d __DEFAULT_FN_ATTRS
    148 _mm_andnot_pd(__m128d __a, __m128d __b)
    149 {
    150   return (__m128d)(~(__v4su)__a & (__v4su)__b);
    151 }
    152 
    153 static __inline__ __m128d __DEFAULT_FN_ATTRS
    154 _mm_or_pd(__m128d __a, __m128d __b)
    155 {
    156   return (__m128d)((__v4su)__a | (__v4su)__b);
    157 }
    158 
    159 static __inline__ __m128d __DEFAULT_FN_ATTRS
    160 _mm_xor_pd(__m128d __a, __m128d __b)
    161 {
    162   return (__m128d)((__v4su)__a ^ (__v4su)__b);
    163 }
    164 
    165 static __inline__ __m128d __DEFAULT_FN_ATTRS
    166 _mm_cmpeq_pd(__m128d __a, __m128d __b)
    167 {
    168   return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
    169 }
    170 
    171 static __inline__ __m128d __DEFAULT_FN_ATTRS
    172 _mm_cmplt_pd(__m128d __a, __m128d __b)
    173 {
    174   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
    175 }
    176 
    177 static __inline__ __m128d __DEFAULT_FN_ATTRS
    178 _mm_cmple_pd(__m128d __a, __m128d __b)
    179 {
    180   return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
    181 }
    182 
    183 static __inline__ __m128d __DEFAULT_FN_ATTRS
    184 _mm_cmpgt_pd(__m128d __a, __m128d __b)
    185 {
    186   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
    187 }
    188 
    189 static __inline__ __m128d __DEFAULT_FN_ATTRS
    190 _mm_cmpge_pd(__m128d __a, __m128d __b)
    191 {
    192   return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
    193 }
    194 
    195 static __inline__ __m128d __DEFAULT_FN_ATTRS
    196 _mm_cmpord_pd(__m128d __a, __m128d __b)
    197 {
    198   return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
    199 }
    200 
    201 static __inline__ __m128d __DEFAULT_FN_ATTRS
    202 _mm_cmpunord_pd(__m128d __a, __m128d __b)
    203 {
    204   return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
    205 }
    206 
    207 static __inline__ __m128d __DEFAULT_FN_ATTRS
    208 _mm_cmpneq_pd(__m128d __a, __m128d __b)
    209 {
    210   return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
    211 }
    212 
    213 static __inline__ __m128d __DEFAULT_FN_ATTRS
    214 _mm_cmpnlt_pd(__m128d __a, __m128d __b)
    215 {
    216   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
    217 }
    218 
    219 static __inline__ __m128d __DEFAULT_FN_ATTRS
    220 _mm_cmpnle_pd(__m128d __a, __m128d __b)
    221 {
    222   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
    223 }
    224 
    225 static __inline__ __m128d __DEFAULT_FN_ATTRS
    226 _mm_cmpngt_pd(__m128d __a, __m128d __b)
    227 {
    228   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
    229 }
    230 
    231 static __inline__ __m128d __DEFAULT_FN_ATTRS
    232 _mm_cmpnge_pd(__m128d __a, __m128d __b)
    233 {
    234   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
    235 }
    236 
    237 static __inline__ __m128d __DEFAULT_FN_ATTRS
    238 _mm_cmpeq_sd(__m128d __a, __m128d __b)
    239 {
    240   return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
    241 }
    242 
    243 static __inline__ __m128d __DEFAULT_FN_ATTRS
    244 _mm_cmplt_sd(__m128d __a, __m128d __b)
    245 {
    246   return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
    247 }
    248 
    249 static __inline__ __m128d __DEFAULT_FN_ATTRS
    250 _mm_cmple_sd(__m128d __a, __m128d __b)
    251 {
    252   return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
    253 }
    254 
    255 static __inline__ __m128d __DEFAULT_FN_ATTRS
    256 _mm_cmpgt_sd(__m128d __a, __m128d __b)
    257 {
    258   __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
    259   return (__m128d) { __c[0], __a[1] };
    260 }
    261 
    262 static __inline__ __m128d __DEFAULT_FN_ATTRS
    263 _mm_cmpge_sd(__m128d __a, __m128d __b)
    264 {
    265   __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
    266   return (__m128d) { __c[0], __a[1] };
    267 }
    268 
    269 static __inline__ __m128d __DEFAULT_FN_ATTRS
    270 _mm_cmpord_sd(__m128d __a, __m128d __b)
    271 {
    272   return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
    273 }
    274 
    275 static __inline__ __m128d __DEFAULT_FN_ATTRS
    276 _mm_cmpunord_sd(__m128d __a, __m128d __b)
    277 {
    278   return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
    279 }
    280 
    281 static __inline__ __m128d __DEFAULT_FN_ATTRS
    282 _mm_cmpneq_sd(__m128d __a, __m128d __b)
    283 {
    284   return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
    285 }
    286 
    287 static __inline__ __m128d __DEFAULT_FN_ATTRS
    288 _mm_cmpnlt_sd(__m128d __a, __m128d __b)
    289 {
    290   return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
    291 }
    292 
    293 static __inline__ __m128d __DEFAULT_FN_ATTRS
    294 _mm_cmpnle_sd(__m128d __a, __m128d __b)
    295 {
    296   return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
    297 }
    298 
    299 static __inline__ __m128d __DEFAULT_FN_ATTRS
    300 _mm_cmpngt_sd(__m128d __a, __m128d __b)
    301 {
    302   __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
    303   return (__m128d) { __c[0], __a[1] };
    304 }
    305 
    306 static __inline__ __m128d __DEFAULT_FN_ATTRS
    307 _mm_cmpnge_sd(__m128d __a, __m128d __b)
    308 {
    309   __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
    310   return (__m128d) { __c[0], __a[1] };
    311 }
    312 
    313 static __inline__ int __DEFAULT_FN_ATTRS
    314 _mm_comieq_sd(__m128d __a, __m128d __b)
    315 {
    316   return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
    317 }
    318 
    319 static __inline__ int __DEFAULT_FN_ATTRS
    320 _mm_comilt_sd(__m128d __a, __m128d __b)
    321 {
    322   return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
    323 }
    324 
    325 static __inline__ int __DEFAULT_FN_ATTRS
    326 _mm_comile_sd(__m128d __a, __m128d __b)
    327 {
    328   return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
    329 }
    330 
    331 static __inline__ int __DEFAULT_FN_ATTRS
    332 _mm_comigt_sd(__m128d __a, __m128d __b)
    333 {
    334   return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
    335 }
    336 
    337 static __inline__ int __DEFAULT_FN_ATTRS
    338 _mm_comige_sd(__m128d __a, __m128d __b)
    339 {
    340   return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
    341 }
    342 
    343 static __inline__ int __DEFAULT_FN_ATTRS
    344 _mm_comineq_sd(__m128d __a, __m128d __b)
    345 {
    346   return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
    347 }
    348 
    349 static __inline__ int __DEFAULT_FN_ATTRS
    350 _mm_ucomieq_sd(__m128d __a, __m128d __b)
    351 {
    352   return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
    353 }
    354 
    355 static __inline__ int __DEFAULT_FN_ATTRS
    356 _mm_ucomilt_sd(__m128d __a, __m128d __b)
    357 {
    358   return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
    359 }
    360 
    361 static __inline__ int __DEFAULT_FN_ATTRS
    362 _mm_ucomile_sd(__m128d __a, __m128d __b)
    363 {
    364   return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
    365 }
    366 
    367 static __inline__ int __DEFAULT_FN_ATTRS
    368 _mm_ucomigt_sd(__m128d __a, __m128d __b)
    369 {
    370   return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
    371 }
    372 
    373 static __inline__ int __DEFAULT_FN_ATTRS
    374 _mm_ucomige_sd(__m128d __a, __m128d __b)
    375 {
    376   return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
    377 }
    378 
    379 static __inline__ int __DEFAULT_FN_ATTRS
    380 _mm_ucomineq_sd(__m128d __a, __m128d __b)
    381 {
    382   return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
    383 }
    384 
    385 static __inline__ __m128 __DEFAULT_FN_ATTRS
    386 _mm_cvtpd_ps(__m128d __a)
    387 {
    388   return __builtin_ia32_cvtpd2ps((__v2df)__a);
    389 }
    390 
    391 static __inline__ __m128d __DEFAULT_FN_ATTRS
    392 _mm_cvtps_pd(__m128 __a)
    393 {
    394   return (__m128d) __builtin_convertvector(
    395       __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
    396 }
    397 
    398 static __inline__ __m128d __DEFAULT_FN_ATTRS
    399 _mm_cvtepi32_pd(__m128i __a)
    400 {
    401   return (__m128d) __builtin_convertvector(
    402       __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
    403 }
    404 
    405 static __inline__ __m128i __DEFAULT_FN_ATTRS
    406 _mm_cvtpd_epi32(__m128d __a)
    407 {
    408   return __builtin_ia32_cvtpd2dq((__v2df)__a);
    409 }
    410 
    411 static __inline__ int __DEFAULT_FN_ATTRS
    412 _mm_cvtsd_si32(__m128d __a)
    413 {
    414   return __builtin_ia32_cvtsd2si((__v2df)__a);
    415 }
    416 
    417 static __inline__ __m128 __DEFAULT_FN_ATTRS
    418 _mm_cvtsd_ss(__m128 __a, __m128d __b)
    419 {
    420   __a[0] = __b[0];
    421   return __a;
    422 }
    423 
    424 static __inline__ __m128d __DEFAULT_FN_ATTRS
    425 _mm_cvtsi32_sd(__m128d __a, int __b)
    426 {
    427   __a[0] = __b;
    428   return __a;
    429 }
    430 
    431 static __inline__ __m128d __DEFAULT_FN_ATTRS
    432 _mm_cvtss_sd(__m128d __a, __m128 __b)
    433 {
    434   __a[0] = __b[0];
    435   return __a;
    436 }
    437 
    438 static __inline__ __m128i __DEFAULT_FN_ATTRS
    439 _mm_cvttpd_epi32(__m128d __a)
    440 {
    441   return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
    442 }
    443 
    444 static __inline__ int __DEFAULT_FN_ATTRS
    445 _mm_cvttsd_si32(__m128d __a)
    446 {
    447   return __a[0];
    448 }
    449 
    450 static __inline__ __m64 __DEFAULT_FN_ATTRS
    451 _mm_cvtpd_pi32(__m128d __a)
    452 {
    453   return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
    454 }
    455 
    456 static __inline__ __m64 __DEFAULT_FN_ATTRS
    457 _mm_cvttpd_pi32(__m128d __a)
    458 {
    459   return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
    460 }
    461 
    462 static __inline__ __m128d __DEFAULT_FN_ATTRS
    463 _mm_cvtpi32_pd(__m64 __a)
    464 {
    465   return __builtin_ia32_cvtpi2pd((__v2si)__a);
    466 }
    467 
    468 static __inline__ double __DEFAULT_FN_ATTRS
    469 _mm_cvtsd_f64(__m128d __a)
    470 {
    471   return __a[0];
    472 }
    473 
    474 static __inline__ __m128d __DEFAULT_FN_ATTRS
    475 _mm_load_pd(double const *__dp)
    476 {
    477   return *(__m128d*)__dp;
    478 }
    479 
    480 static __inline__ __m128d __DEFAULT_FN_ATTRS
    481 _mm_load1_pd(double const *__dp)
    482 {
    483   struct __mm_load1_pd_struct {
    484     double __u;
    485   } __attribute__((__packed__, __may_alias__));
    486   double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
    487   return (__m128d){ __u, __u };
    488 }
    489 
    490 #define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
    491 
    492 static __inline__ __m128d __DEFAULT_FN_ATTRS
    493 _mm_loadr_pd(double const *__dp)
    494 {
    495   __m128d __u = *(__m128d*)__dp;
    496   return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
    497 }
    498 
    499 static __inline__ __m128d __DEFAULT_FN_ATTRS
    500 _mm_loadu_pd(double const *__dp)
    501 {
    502   struct __loadu_pd {
    503     __m128d __v;
    504   } __attribute__((__packed__, __may_alias__));
    505   return ((struct __loadu_pd*)__dp)->__v;
    506 }
    507 
    508 static __inline__ __m128i __DEFAULT_FN_ATTRS
    509 _mm_loadu_si64(void const *__a)
    510 {
    511   struct __loadu_si64 {
    512     long long __v;
    513   } __attribute__((__packed__, __may_alias__));
    514   long long __u = ((struct __loadu_si64*)__a)->__v;
    515   return (__m128i){__u, 0L};
    516 }
    517 
    518 static __inline__ __m128d __DEFAULT_FN_ATTRS
    519 _mm_load_sd(double const *__dp)
    520 {
    521   struct __mm_load_sd_struct {
    522     double __u;
    523   } __attribute__((__packed__, __may_alias__));
    524   double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
    525   return (__m128d){ __u, 0 };
    526 }
    527 
    528 static __inline__ __m128d __DEFAULT_FN_ATTRS
    529 _mm_loadh_pd(__m128d __a, double const *__dp)
    530 {
    531   struct __mm_loadh_pd_struct {
    532     double __u;
    533   } __attribute__((__packed__, __may_alias__));
    534   double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
    535   return (__m128d){ __a[0], __u };
    536 }
    537 
    538 static __inline__ __m128d __DEFAULT_FN_ATTRS
    539 _mm_loadl_pd(__m128d __a, double const *__dp)
    540 {
    541   struct __mm_loadl_pd_struct {
    542     double __u;
    543   } __attribute__((__packed__, __may_alias__));
    544   double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
    545   return (__m128d){ __u, __a[1] };
    546 }
    547 
    548 static __inline__ __m128d __DEFAULT_FN_ATTRS
    549 _mm_undefined_pd(void)
    550 {
    551   return (__m128d)__builtin_ia32_undef128();
    552 }
    553 
    554 static __inline__ __m128d __DEFAULT_FN_ATTRS
    555 _mm_set_sd(double __w)
    556 {
    557   return (__m128d){ __w, 0 };
    558 }
    559 
    560 static __inline__ __m128d __DEFAULT_FN_ATTRS
    561 _mm_set1_pd(double __w)
    562 {
    563   return (__m128d){ __w, __w };
    564 }
    565 
    566 static __inline__ __m128d __DEFAULT_FN_ATTRS
    567 _mm_set_pd(double __w, double __x)
    568 {
    569   return (__m128d){ __x, __w };
    570 }
    571 
    572 static __inline__ __m128d __DEFAULT_FN_ATTRS
    573 _mm_setr_pd(double __w, double __x)
    574 {
    575   return (__m128d){ __w, __x };
    576 }
    577 
    578 static __inline__ __m128d __DEFAULT_FN_ATTRS
    579 _mm_setzero_pd(void)
    580 {
    581   return (__m128d){ 0, 0 };
    582 }
    583 
    584 static __inline__ __m128d __DEFAULT_FN_ATTRS
    585 _mm_move_sd(__m128d __a, __m128d __b)
    586 {
    587   return (__m128d){ __b[0], __a[1] };
    588 }
    589 
    590 static __inline__ void __DEFAULT_FN_ATTRS
    591 _mm_store_sd(double *__dp, __m128d __a)
    592 {
    593   struct __mm_store_sd_struct {
    594     double __u;
    595   } __attribute__((__packed__, __may_alias__));
    596   ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
    597 }
    598 
    599 static __inline__ void __DEFAULT_FN_ATTRS
    600 _mm_store_pd(double *__dp, __m128d __a)
    601 {
    602   *(__m128d*)__dp = __a;
    603 }
    604 
    605 static __inline__ void __DEFAULT_FN_ATTRS
    606 _mm_store1_pd(double *__dp, __m128d __a)
    607 {
    608   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
    609   _mm_store_pd(__dp, __a);
    610 }
    611 
    612 static __inline__ void __DEFAULT_FN_ATTRS
    613 _mm_store_pd1(double *__dp, __m128d __a)
    614 {
    615   return _mm_store1_pd(__dp, __a);
    616 }
    617 
    618 static __inline__ void __DEFAULT_FN_ATTRS
    619 _mm_storeu_pd(double *__dp, __m128d __a)
    620 {
    621   struct __storeu_pd {
    622     __m128d __v;
    623   } __attribute__((__packed__, __may_alias__));
    624   ((struct __storeu_pd*)__dp)->__v = __a;
    625 }
    626 
    627 static __inline__ void __DEFAULT_FN_ATTRS
    628 _mm_storer_pd(double *__dp, __m128d __a)
    629 {
    630   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
    631   *(__m128d *)__dp = __a;
    632 }
    633 
    634 static __inline__ void __DEFAULT_FN_ATTRS
    635 _mm_storeh_pd(double *__dp, __m128d __a)
    636 {
    637   struct __mm_storeh_pd_struct {
    638     double __u;
    639   } __attribute__((__packed__, __may_alias__));
    640   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
    641 }
    642 
    643 static __inline__ void __DEFAULT_FN_ATTRS
    644 _mm_storel_pd(double *__dp, __m128d __a)
    645 {
    646   struct __mm_storeh_pd_struct {
    647     double __u;
    648   } __attribute__((__packed__, __may_alias__));
    649   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
    650 }
    651 
    652 static __inline__ __m128i __DEFAULT_FN_ATTRS
    653 _mm_add_epi8(__m128i __a, __m128i __b)
    654 {
    655   return (__m128i)((__v16qu)__a + (__v16qu)__b);
    656 }
    657 
    658 static __inline__ __m128i __DEFAULT_FN_ATTRS
    659 _mm_add_epi16(__m128i __a, __m128i __b)
    660 {
    661   return (__m128i)((__v8hu)__a + (__v8hu)__b);
    662 }
    663 
    664 static __inline__ __m128i __DEFAULT_FN_ATTRS
    665 _mm_add_epi32(__m128i __a, __m128i __b)
    666 {
    667   return (__m128i)((__v4su)__a + (__v4su)__b);
    668 }
    669 
    670 static __inline__ __m64 __DEFAULT_FN_ATTRS
    671 _mm_add_si64(__m64 __a, __m64 __b)
    672 {
    673   return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
    674 }
    675 
    676 static __inline__ __m128i __DEFAULT_FN_ATTRS
    677 _mm_add_epi64(__m128i __a, __m128i __b)
    678 {
    679   return (__m128i)((__v2du)__a + (__v2du)__b);
    680 }
    681 
    682 static __inline__ __m128i __DEFAULT_FN_ATTRS
    683 _mm_adds_epi8(__m128i __a, __m128i __b)
    684 {
    685   return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
    686 }
    687 
    688 static __inline__ __m128i __DEFAULT_FN_ATTRS
    689 _mm_adds_epi16(__m128i __a, __m128i __b)
    690 {
    691   return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
    692 }
    693 
    694 static __inline__ __m128i __DEFAULT_FN_ATTRS
    695 _mm_adds_epu8(__m128i __a, __m128i __b)
    696 {
    697   return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
    698 }
    699 
    700 static __inline__ __m128i __DEFAULT_FN_ATTRS
    701 _mm_adds_epu16(__m128i __a, __m128i __b)
    702 {
    703   return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
    704 }
    705 
    706 static __inline__ __m128i __DEFAULT_FN_ATTRS
    707 _mm_avg_epu8(__m128i __a, __m128i __b)
    708 {
    709   return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
    710 }
    711 
    712 static __inline__ __m128i __DEFAULT_FN_ATTRS
    713 _mm_avg_epu16(__m128i __a, __m128i __b)
    714 {
    715   return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
    716 }
    717 
    718 static __inline__ __m128i __DEFAULT_FN_ATTRS
    719 _mm_madd_epi16(__m128i __a, __m128i __b)
    720 {
    721   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
    722 }
    723 
    724 static __inline__ __m128i __DEFAULT_FN_ATTRS
    725 _mm_max_epi16(__m128i __a, __m128i __b)
    726 {
    727   return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
    728 }
    729 
    730 static __inline__ __m128i __DEFAULT_FN_ATTRS
    731 _mm_max_epu8(__m128i __a, __m128i __b)
    732 {
    733   return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
    734 }
    735 
    736 static __inline__ __m128i __DEFAULT_FN_ATTRS
    737 _mm_min_epi16(__m128i __a, __m128i __b)
    738 {
    739   return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
    740 }
    741 
    742 static __inline__ __m128i __DEFAULT_FN_ATTRS
    743 _mm_min_epu8(__m128i __a, __m128i __b)
    744 {
    745   return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
    746 }
    747 
    748 static __inline__ __m128i __DEFAULT_FN_ATTRS
    749 _mm_mulhi_epi16(__m128i __a, __m128i __b)
    750 {
    751   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
    752 }
    753 
    754 static __inline__ __m128i __DEFAULT_FN_ATTRS
    755 _mm_mulhi_epu16(__m128i __a, __m128i __b)
    756 {
    757   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
    758 }
    759 
    760 /// \brief Multiplies the corresponding elements of two [8 x short] vectors and
    761 ///    returns a vector containing the low-order 16 bits of each 32-bit product
    762 ///    in the corresponding element.
    763 ///
    764 /// \headerfile <x86intrin.h>
    765 ///
    766 /// This intrinsic corresponds to the \c VPMULLW / PMULLW instruction.
    767 ///
    768 /// \param __a
    769 ///    A 128-bit integer vector containing one of the source operands.
    770 /// \param __b
    771 ///    A 128-bit integer vector containing one of the source operands.
    772 /// \returns A 128-bit integer vector containing the products of both operands.
    773 static __inline__ __m128i __DEFAULT_FN_ATTRS
    774 _mm_mullo_epi16(__m128i __a, __m128i __b)
    775 {
    776   return (__m128i)((__v8hu)__a * (__v8hu)__b);
    777 }
    778 
    779 /// \brief Multiplies 32-bit unsigned integer values contained in the lower bits
    780 ///    of the two 64-bit integer vectors and returns the 64-bit unsigned
    781 ///    product.
    782 ///
    783 /// \headerfile <x86intrin.h>
    784 ///
    785 /// This intrinsic corresponds to the \c PMULUDQ instruction.
    786 ///
    787 /// \param __a
    788 ///    A 64-bit integer containing one of the source operands.
    789 /// \param __b
    790 ///    A 64-bit integer containing one of the source operands.
    791 /// \returns A 64-bit integer vector containing the product of both operands.
    792 static __inline__ __m64 __DEFAULT_FN_ATTRS
    793 _mm_mul_su32(__m64 __a, __m64 __b)
    794 {
    795   return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
    796 }
    797 
    798 /// \brief Multiplies 32-bit unsigned integer values contained in the lower
    799 ///    bits of the corresponding elements of two [2 x i64] vectors, and returns
    800 ///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
    801 ///
    802 /// \headerfile <x86intrin.h>
    803 ///
    804 /// This intrinsic corresponds to the \c VPMULUDQ / PMULUDQ instruction.
    805 ///
    806 /// \param __a
    807 ///    A [2 x i64] vector containing one of the source operands.
    808 /// \param __b
    809 ///    A [2 x i64] vector containing one of the source operands.
    810 /// \returns A [2 x i64] vector containing the product of both operands.
    811 static __inline__ __m128i __DEFAULT_FN_ATTRS
    812 _mm_mul_epu32(__m128i __a, __m128i __b)
    813 {
    814   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
    815 }
    816 
    817 /// \brief Computes the absolute differences of corresponding 8-bit integer
    818 ///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
    819 ///    separately sums the second 8 absolute differences. Packss these two
    820 ///    unsigned 16-bit integer sums into the upper and lower elements of a
    821 ///    [2 x i64] vector.
    822 ///
    823 /// \headerfile <x86intrin.h>
    824 ///
    825 /// This intrinsic corresponds to the \c VPSADBW / PSADBW instruction.
    826 ///
    827 /// \param __a
    828 ///    A 128-bit integer vector containing one of the source operands.
    829 /// \param __b
    830 ///    A 128-bit integer vector containing one of the source operands.
    831 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
    832 ///    differences between both operands.
    833 static __inline__ __m128i __DEFAULT_FN_ATTRS
    834 _mm_sad_epu8(__m128i __a, __m128i __b)
    835 {
    836   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
    837 }
    838 
    839 /// \brief Subtracts the corresponding 8-bit integer values in the operands.
    840 ///
    841 /// \headerfile <x86intrin.h>
    842 ///
    843 /// This intrinsic corresponds to the \c VPSUBB / PSUBB instruction.
    844 ///
    845 /// \param __a
    846 ///    A 128-bit integer vector containing the minuends.
    847 /// \param __b
    848 ///    A 128-bit integer vector containing the subtrahends.
    849 /// \returns A 128-bit integer vector containing the differences of the values
    850 ///    in the operands.
    851 static __inline__ __m128i __DEFAULT_FN_ATTRS
    852 _mm_sub_epi8(__m128i __a, __m128i __b)
    853 {
    854   return (__m128i)((__v16qu)__a - (__v16qu)__b);
    855 }
    856 
    857 /// \brief Subtracts the corresponding 16-bit integer values in the operands.
    858 ///
    859 /// \headerfile <x86intrin.h>
    860 ///
    861 /// This intrinsic corresponds to the \c VPSUBW / PSUBW instruction.
    862 ///
    863 /// \param __a
    864 ///    A 128-bit integer vector containing the minuends.
    865 /// \param __b
    866 ///    A 128-bit integer vector containing the subtrahends.
    867 /// \returns A 128-bit integer vector containing the differences of the values
    868 ///    in the operands.
    869 static __inline__ __m128i __DEFAULT_FN_ATTRS
    870 _mm_sub_epi16(__m128i __a, __m128i __b)
    871 {
    872   return (__m128i)((__v8hu)__a - (__v8hu)__b);
    873 }
    874 
    875 /// \brief Subtracts the corresponding 32-bit integer values in the operands.
    876 ///
    877 /// \headerfile <x86intrin.h>
    878 ///
    879 /// This intrinsic corresponds to the \c VPSUBD / PSUBD instruction.
    880 ///
    881 /// \param __a
    882 ///    A 128-bit integer vector containing the minuends.
    883 /// \param __b
    884 ///    A 128-bit integer vector containing the subtrahends.
    885 /// \returns A 128-bit integer vector containing the differences of the values
    886 ///    in the operands.
    887 static __inline__ __m128i __DEFAULT_FN_ATTRS
    888 _mm_sub_epi32(__m128i __a, __m128i __b)
    889 {
    890   return (__m128i)((__v4su)__a - (__v4su)__b);
    891 }
    892 
    893 /// \brief Subtracts signed or unsigned 64-bit integer values and writes the
    894 ///    difference to the corresponding bits in the destination.
    895 ///
    896 /// \headerfile <x86intrin.h>
    897 ///
    898 /// This intrinsic corresponds to the \c PSUBQ instruction.
    899 ///
    900 /// \param __a
    901 ///    A 64-bit integer vector containing the minuend.
    902 /// \param __b
    903 ///    A 64-bit integer vector containing the subtrahend.
    904 /// \returns A 64-bit integer vector containing the difference of the values in
    905 ///    the operands.
    906 static __inline__ __m64 __DEFAULT_FN_ATTRS
    907 _mm_sub_si64(__m64 __a, __m64 __b)
    908 {
    909   return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
    910 }
    911 
    912 /// \brief Computes __a - __b for each pair of 64-bit integer elements.
    913 ///
    914 /// \headerfile <x86intrin.h>
    915 ///
    916 /// This intrinsic corresponds to the \c VPSUBQ / PSUBQ instruction.
    917 ///
    918 /// \param __a
    919 ///    A 128-bit integer vector supplying the minuends.
    920 /// \param __b
    921 ///    A 128-bit integer vector supplying the subtrahends.
    922 /// \returns A 128-bit integer vector holding the element-wise differences.
    923 static __inline__ __m128i __DEFAULT_FN_ATTRS
    924 _mm_sub_epi64(__m128i __a, __m128i __b)
    925 {
    926   __v2du __diff = (__v2du)__a - (__v2du)__b;
    927   return (__m128i)__diff;
    928 }
    929 
    930 /// \brief Subtracts corresponding signed 8-bit integers with saturation and
    931 ///    stores each difference in the matching byte of the result. A
    932 ///    difference above 7Fh becomes 7Fh; one below 80h becomes 80h.
    933 ///
    934 /// \headerfile <x86intrin.h>
    935 ///
    936 /// This intrinsic corresponds to the \c VPSUBSB / PSUBSB instruction.
    937 ///
    938 /// \param __a
    939 ///    A 128-bit integer vector supplying the minuends.
    940 /// \param __b
    941 ///    A 128-bit integer vector supplying the subtrahends.
    942 /// \returns A 128-bit integer vector holding the saturated differences.
    943 static __inline__ __m128i __DEFAULT_FN_ATTRS
    944 _mm_subs_epi8(__m128i __a, __m128i __b)
    945 {
    946   __v16qi __minuends = (__v16qi)__a;
    947   __v16qi __subtrahends = (__v16qi)__b;
    948   return (__m128i)__builtin_ia32_psubsb128(__minuends, __subtrahends);
    949 }
    950 
    951 /// \brief Subtracts corresponding signed 16-bit integers with saturation and
    952 ///    stores each difference in the matching 16-bit element of the result.
    953 ///    A difference above 7FFFh becomes 7FFFh; one below 8000h becomes 8000h.
    954 ///
    955 /// \headerfile <x86intrin.h>
    956 ///
    957 /// This intrinsic corresponds to the \c VPSUBSW / PSUBSW instruction.
    958 ///
    959 /// \param __a
    960 ///    A 128-bit integer vector supplying the minuends.
    961 /// \param __b
    962 ///    A 128-bit integer vector supplying the subtrahends.
    963 /// \returns A 128-bit integer vector holding the saturated differences.
    964 static __inline__ __m128i __DEFAULT_FN_ATTRS
    965 _mm_subs_epi16(__m128i __a, __m128i __b)
    966 {
    967   __v8hi __minuends = (__v8hi)__a;
    968   __v8hi __subtrahends = (__v8hi)__b;
    969   return (__m128i)__builtin_ia32_psubsw128(__minuends, __subtrahends);
    970 }
    971 
    972 /// \brief Subtracts corresponding unsigned 8-bit integers with saturation;
    973 ///    a difference below 00h becomes 00h.
    974 ///
    975 /// \headerfile <x86intrin.h>
    976 ///
    977 /// This intrinsic corresponds to the \c VPSUBUSB / PSUBUSB instruction.
    978 ///
    979 /// \param __a
    980 ///    A 128-bit integer vector supplying the minuends.
    981 /// \param __b
    982 ///    A 128-bit integer vector supplying the subtrahends.
    983 /// \returns A 128-bit integer vector of the saturated unsigned differences.
    984 static __inline__ __m128i __DEFAULT_FN_ATTRS
    985 _mm_subs_epu8(__m128i __a, __m128i __b)
    986 {
    987   __v16qi __minuends = (__v16qi)__a;
    988   __v16qi __subtrahends = (__v16qi)__b;
    989   return (__m128i)__builtin_ia32_psubusb128(__minuends, __subtrahends);
    990 }
    991 
    992 /// \brief Subtracts corresponding unsigned 16-bit integers with saturation;
    993 ///    a difference below 0000h becomes 0000h.
    994 ///
    995 /// \headerfile <x86intrin.h>
    996 ///
    997 /// This intrinsic corresponds to the \c VPSUBUSW / PSUBUSW instruction.
    998 ///
    999 /// \param __a
   1000 ///    A 128-bit integer vector supplying the minuends.
   1001 /// \param __b
   1002 ///    A 128-bit integer vector supplying the subtrahends.
   1003 /// \returns A 128-bit integer vector of the saturated unsigned differences.
   1004 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1005 _mm_subs_epu16(__m128i __a, __m128i __b)
   1006 {
   1007   __v8hi __minuends = (__v8hi)__a;
   1008   __v8hi __subtrahends = (__v8hi)__b;
   1009   return (__m128i)__builtin_ia32_psubusw128(__minuends, __subtrahends);
   1010 }
   1011 
   1012 /// \brief Computes the bitwise AND of two 128-bit integer vectors.
   1013 ///
   1014 /// \headerfile <x86intrin.h>
   1015 ///
   1016 /// This intrinsic corresponds to the \c VPAND / PAND instruction.
   1017 ///
   1018 /// \param __a
   1019 ///    A 128-bit integer vector; the first source operand.
   1020 /// \param __b
   1021 ///    A 128-bit integer vector; the second source operand.
   1022 /// \returns A 128-bit integer vector holding __a AND __b.
   1023 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1024 _mm_and_si128(__m128i __a, __m128i __b)
   1025 {
   1026   __v2du __bits = (__v2du)__a & (__v2du)__b;
   1027   return (__m128i)__bits;
   1028 }
   1029 
   1030 /// \brief Computes the bitwise AND of the one's complement of the first
   1031 ///    128-bit integer vector with the second 128-bit integer vector.
   1032 ///
   1033 /// \headerfile <x86intrin.h>
   1034 ///
   1035 /// This intrinsic corresponds to the \c VPANDN / PANDN instruction.
   1036 ///
   1037 /// \param __a
   1038 ///    A 128-bit vector; the left source operand, which is complemented
   1039 ///    before the AND is applied.
   1040 /// \param __b
   1041 ///    A 128-bit vector; the right source operand, used unchanged.
   1042 /// \returns A 128-bit integer vector holding (NOT __a) AND __b.
   1043 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1044 _mm_andnot_si128(__m128i __a, __m128i __b)
   1045 {
   1046   __v2du __inverted = ~(__v2du)__a;
   1047   return (__m128i)(__inverted & (__v2du)__b);
   1048 }
   1049 /// \brief Computes the bitwise OR of two 128-bit integer vectors.
   1050 ///
   1051 /// \headerfile <x86intrin.h>
   1052 ///
   1053 /// This intrinsic corresponds to the \c VPOR / POR instruction.
   1054 ///
   1055 /// \param __a
   1056 ///    A 128-bit integer vector; the first source operand.
   1057 /// \param __b
   1058 ///    A 128-bit integer vector; the second source operand.
   1059 /// \returns A 128-bit integer vector holding __a OR __b.
   1060 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1061 _mm_or_si128(__m128i __a, __m128i __b)
   1062 {
   1063   __v2du __bits = (__v2du)__a | (__v2du)__b;
   1064   return (__m128i)__bits;
   1065 }
   1066 
   1067 /// \brief Computes the bitwise exclusive OR of two 128-bit integer vectors.
   1068 ///
   1069 /// \headerfile <x86intrin.h>
   1070 ///
   1071 /// This intrinsic corresponds to the \c VPXOR / PXOR instruction.
   1072 ///
   1073 /// \param __a
   1074 ///    A 128-bit integer vector; the first source operand.
   1075 /// \param __b
   1076 ///    A 128-bit integer vector; the second source operand.
   1077 /// \returns A 128-bit integer vector holding __a XOR __b.
   1078 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1079 _mm_xor_si128(__m128i __a, __m128i __b)
   1080 {
   1081   __v2du __bits = (__v2du)__a ^ (__v2du)__b;
   1082   return (__m128i)__bits;
   1083 }
   1084 
   1085 /// \brief Left-shifts the 128-bit integer vector operand by the specified
   1086 ///    number of bytes. Low-order bits are cleared. If (char)imm has any of
   1087 ///    bits [7:4] set, every result byte comes from _mm_setzero_si128().
   1088 ///
   1089 /// \headerfile <x86intrin.h>
   1090 ///
   1091 /// \code
   1092 /// __m128i _mm_slli_si128(__m128i a, const int imm);
   1093 /// \endcode
   1094 ///
   1095 /// This intrinsic corresponds to the \c VPSLLDQ / PSLLDQ instruction.
   1096 ///
   1097 /// \param a
   1098 ///    A 128-bit integer vector containing the source operand.
   1099 /// \param imm
   1100 ///    An immediate value giving the number of bytes to left-shift operand a.
   1101 /// \returns A 128-bit integer vector containing the left-shifted value.
   1102 #define _mm_slli_si128(a, imm) __extension__ ({                              \
   1103   (__m128i)__builtin_shufflevector(                                          \
   1104                                  (__v16qi)_mm_setzero_si128(),               \
   1105                                  (__v16qi)(__m128i)(a),                      \
   1106                                  ((char)(imm)&0xF0) ?  0 : 16 - (char)(imm), \
   1107                                  ((char)(imm)&0xF0) ?  1 : 17 - (char)(imm), \
   1108                                  ((char)(imm)&0xF0) ?  2 : 18 - (char)(imm), \
   1109                                  ((char)(imm)&0xF0) ?  3 : 19 - (char)(imm), \
   1110                                  ((char)(imm)&0xF0) ?  4 : 20 - (char)(imm), \
   1111                                  ((char)(imm)&0xF0) ?  5 : 21 - (char)(imm), \
   1112                                  ((char)(imm)&0xF0) ?  6 : 22 - (char)(imm), \
   1113                                  ((char)(imm)&0xF0) ?  7 : 23 - (char)(imm), \
   1114                                  ((char)(imm)&0xF0) ?  8 : 24 - (char)(imm), \
   1115                                  ((char)(imm)&0xF0) ?  9 : 25 - (char)(imm), \
   1116                                  ((char)(imm)&0xF0) ? 10 : 26 - (char)(imm), \
   1117                                  ((char)(imm)&0xF0) ? 11 : 27 - (char)(imm), \
   1118                                  ((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \
   1119                                  ((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \
   1120                                  ((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \
   1121                                  ((char)(imm)&0xF0) ? 15 : 31 - (char)(imm)); })
   1122 
   1123 #define _mm_bslli_si128(a, imm) \
   1124   _mm_slli_si128((a), (imm)) /* alternate name; expands to _mm_slli_si128 */
   1125 
   1126 /// \brief Shifts every 16-bit element of a 128-bit integer vector left by
   1127 ///    the given bit count, clearing the vacated low-order bits.
   1128 ///
   1129 /// \headerfile <x86intrin.h>
   1130 ///
   1131 /// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
   1132 ///
   1133 /// \param __a
   1134 ///    A 128-bit integer vector holding the values to shift.
   1135 /// \param __count
   1136 ///    An integer giving the number of bits by which each value is shifted.
   1137 /// \returns A 128-bit integer vector holding the left-shifted values.
   1138 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1139 _mm_slli_epi16(__m128i __a, int __count)
   1140 {
   1141   __v8hi __elems = (__v8hi)__a;
   1142   return (__m128i)__builtin_ia32_psllwi128(__elems, __count);
   1143 }
   1144 
   1145 /// \brief Shifts every 16-bit element of a 128-bit integer vector left by
   1146 ///    a bit count taken from a second vector, clearing the low-order bits.
   1147 ///
   1148 /// \headerfile <x86intrin.h>
   1149 ///
   1150 /// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
   1151 ///
   1152 /// \param __a
   1153 ///    A 128-bit integer vector holding the values to shift.
   1154 /// \param __count
   1155 ///    A 128-bit integer vector whose bits [63:0] hold the bit count.
   1156 /// \returns A 128-bit integer vector holding the left-shifted values.
   1157 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1158 _mm_sll_epi16(__m128i __a, __m128i __count)
   1159 {
   1160   __v8hi __shift = (__v8hi)__count;
   1161   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, __shift);
   1162 }
   1163 
   1164 /// \brief Shifts every 32-bit element of a 128-bit integer vector left by
   1165 ///    the given bit count, clearing the vacated low-order bits.
   1166 ///
   1167 /// \headerfile <x86intrin.h>
   1168 ///
   1169 /// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
   1170 ///
   1171 /// \param __a
   1172 ///    A 128-bit integer vector holding the values to shift.
   1173 /// \param __count
   1174 ///    An integer giving the number of bits by which each value is shifted.
   1175 /// \returns A 128-bit integer vector holding the left-shifted values.
   1176 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1177 _mm_slli_epi32(__m128i __a, int __count)
   1178 {
   1179   __v4si __elems = (__v4si)__a;
   1180   return (__m128i)__builtin_ia32_pslldi128(__elems, __count);
   1181 }
   1182 
   1183 /// \brief Shifts every 32-bit element of a 128-bit integer vector left by
   1184 ///    a bit count taken from a second vector, clearing the low-order bits.
   1185 ///
   1186 /// \headerfile <x86intrin.h>
   1187 ///
   1188 /// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
   1189 ///
   1190 /// \param __a
   1191 ///    A 128-bit integer vector holding the values to shift.
   1192 /// \param __count
   1193 ///    A 128-bit integer vector whose bits [63:0] hold the bit count.
   1194 /// \returns A 128-bit integer vector holding the left-shifted values.
   1195 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1196 _mm_sll_epi32(__m128i __a, __m128i __count)
   1197 {
   1198   __v4si __shift = (__v4si)__count;
   1199   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, __shift);
   1200 }
   1201 
   1202 /// \brief Shifts every 64-bit element of a 128-bit integer vector left by
   1203 ///    the given bit count, clearing the vacated low-order bits.
   1204 ///
   1205 /// \headerfile <x86intrin.h>
   1206 ///
   1207 /// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
   1208 ///
   1209 /// \param __a
   1210 ///    A 128-bit integer vector holding the values to shift.
   1211 /// \param __count
   1212 ///    An integer giving the number of bits by which each value is shifted.
   1213 /// \returns A 128-bit integer vector holding the left-shifted values.
   1214 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1215 _mm_slli_epi64(__m128i __a, int __count)
   1216 {
   1217   /* Cast the result explicitly, as the 16- and 32-bit shifts do. */
   1218   return (__m128i)__builtin_ia32_psllqi128((__v2di)__a, __count);
   1219 }
   1220 
   1221 /// \brief Shifts every 64-bit element of a 128-bit integer vector left by
   1222 ///    a bit count taken from a second vector, clearing the low-order bits.
   1223 ///
   1224 /// \headerfile <x86intrin.h>
   1225 ///
   1226 /// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
   1227 ///
   1228 /// \param __a
   1229 ///    A 128-bit integer vector holding the values to shift.
   1230 /// \param __count
   1231 ///    A 128-bit integer vector whose bits [63:0] hold the bit count.
   1232 /// \returns A 128-bit integer vector holding the left-shifted values.
   1233 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1234 _mm_sll_epi64(__m128i __a, __m128i __count)
   1235 {
   1236   /* Cast the result explicitly, as the 16- and 32-bit shifts do. */
   1237   return (__m128i)__builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
   1238 }
   1239 
   1240 /// \brief Shifts every 16-bit element of a 128-bit integer vector right by
   1241 ///    the given bit count. The vacated high-order bits are filled with
   1242 ///    copies of the element's original sign bit.
   1243 ///
   1244 /// \headerfile <x86intrin.h>
   1245 ///
   1246 /// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
   1247 ///
   1248 /// \param __a
   1249 ///    A 128-bit integer vector holding the values to shift.
   1250 /// \param __count
   1251 ///    An integer giving the number of bits by which each value is shifted.
   1252 /// \returns A 128-bit integer vector holding the right-shifted values.
   1253 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1254 _mm_srai_epi16(__m128i __a, int __count)
   1255 {
   1256   __v8hi __elems = (__v8hi)__a;
   1257   return (__m128i)__builtin_ia32_psrawi128(__elems, __count);
   1258 }
   1259 
   1260 /// \brief Shifts every 16-bit element of a 128-bit integer vector right by
   1261 ///    a bit count taken from a second vector. The vacated high-order bits
   1262 ///    are filled with copies of the element's original sign bit.
   1263 ///
   1264 /// \headerfile <x86intrin.h>
   1265 ///
   1266 /// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
   1267 ///
   1268 /// \param __a
   1269 ///    A 128-bit integer vector holding the values to shift.
   1270 /// \param __count
   1271 ///    A 128-bit integer vector whose bits [63:0] hold the bit count.
   1272 /// \returns A 128-bit integer vector holding the right-shifted values.
   1273 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1274 _mm_sra_epi16(__m128i __a, __m128i __count)
   1275 {
   1276   __v8hi __shift = (__v8hi)__count;
   1277   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, __shift);
   1278 }
   1279 
   1280 /// \brief Shifts every 32-bit element of a 128-bit integer vector right by
   1281 ///    the given bit count. The vacated high-order bits are filled with
   1282 ///    copies of the element's original sign bit.
   1283 ///
   1284 /// \headerfile <x86intrin.h>
   1285 ///
   1286 /// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
   1287 ///
   1288 /// \param __a
   1289 ///    A 128-bit integer vector holding the values to shift.
   1290 /// \param __count
   1291 ///    An integer giving the number of bits by which each value is shifted.
   1292 /// \returns A 128-bit integer vector holding the right-shifted values.
   1293 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1294 _mm_srai_epi32(__m128i __a, int __count)
   1295 {
   1296   __v4si __elems = (__v4si)__a;
   1297   return (__m128i)__builtin_ia32_psradi128(__elems, __count);
   1298 }
   1299 
   1300 /// \brief Shifts every 32-bit element of a 128-bit integer vector right by
   1301 ///    a bit count taken from a second vector. The vacated high-order bits
   1302 ///    are filled with copies of the element's original sign bit.
   1303 ///
   1304 /// \headerfile <x86intrin.h>
   1305 ///
   1306 /// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
   1307 ///
   1308 /// \param __a
   1309 ///    A 128-bit integer vector holding the values to shift.
   1310 /// \param __count
   1311 ///    A 128-bit integer vector whose bits [63:0] hold the bit count.
   1312 /// \returns A 128-bit integer vector holding the right-shifted values.
   1313 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1314 _mm_sra_epi32(__m128i __a, __m128i __count)
   1315 {
   1316   __v4si __shift = (__v4si)__count;
   1317   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, __shift);
   1318 }
   1319 
   1320 /// \brief Right-shifts the 128-bit integer vector operand by the specified
   1321 ///    number of bytes. High-order bits are cleared. If (char)imm has any of
   1322 ///    bits [7:4] set, every result byte comes from _mm_setzero_si128().
   1323 ///
   1324 /// \headerfile <x86intrin.h>
   1325 ///
   1326 /// \code
   1327 /// __m128i _mm_srli_si128(__m128i a, const int imm);
   1328 /// \endcode
   1329 ///
   1330 /// This intrinsic corresponds to the \c VPSRLDQ / PSRLDQ instruction.
   1331 ///
   1332 /// \param a
   1333 ///    A 128-bit integer vector containing the source operand.
   1334 /// \param imm
   1335 ///    An immediate value giving the number of bytes to right-shift operand a.
   1336 /// \returns A 128-bit integer vector containing the right-shifted value.
   1337 #define _mm_srli_si128(a, imm) __extension__ ({                              \
   1338   (__m128i)__builtin_shufflevector(                                          \
   1339                                  (__v16qi)(__m128i)(a),                      \
   1340                                  (__v16qi)_mm_setzero_si128(),               \
   1341                                  ((char)(imm)&0xF0) ? 16 : (char)(imm) + 0,  \
   1342                                  ((char)(imm)&0xF0) ? 17 : (char)(imm) + 1,  \
   1343                                  ((char)(imm)&0xF0) ? 18 : (char)(imm) + 2,  \
   1344                                  ((char)(imm)&0xF0) ? 19 : (char)(imm) + 3,  \
   1345                                  ((char)(imm)&0xF0) ? 20 : (char)(imm) + 4,  \
   1346                                  ((char)(imm)&0xF0) ? 21 : (char)(imm) + 5,  \
   1347                                  ((char)(imm)&0xF0) ? 22 : (char)(imm) + 6,  \
   1348                                  ((char)(imm)&0xF0) ? 23 : (char)(imm) + 7,  \
   1349                                  ((char)(imm)&0xF0) ? 24 : (char)(imm) + 8,  \
   1350                                  ((char)(imm)&0xF0) ? 25 : (char)(imm) + 9,  \
   1351                                  ((char)(imm)&0xF0) ? 26 : (char)(imm) + 10, \
   1352                                  ((char)(imm)&0xF0) ? 27 : (char)(imm) + 11, \
   1353                                  ((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \
   1354                                  ((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \
   1355                                  ((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \
   1356                                  ((char)(imm)&0xF0) ? 31 : (char)(imm) + 15); })
   1357 
   1358 #define _mm_bsrli_si128(a, imm) \
   1359   _mm_srli_si128((a), (imm)) /* alternate name; expands to _mm_srli_si128 */
   1360 
   1361 /// \brief Shifts every 16-bit element of a 128-bit integer vector right by
   1362 ///    the given bit count, clearing the vacated high-order bits.
   1363 ///
   1364 /// \headerfile <x86intrin.h>
   1365 ///
   1366 /// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
   1367 ///
   1368 /// \param __a
   1369 ///    A 128-bit integer vector holding the values to shift.
   1370 /// \param __count
   1371 ///    An integer giving the number of bits by which each value is shifted.
   1372 /// \returns A 128-bit integer vector holding the right-shifted values.
   1373 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1374 _mm_srli_epi16(__m128i __a, int __count)
   1375 {
   1376   __v8hi __elems = (__v8hi)__a;
   1377   return (__m128i)__builtin_ia32_psrlwi128(__elems, __count);
   1378 }
   1379 
   1380 /// \brief Shifts every 16-bit element of a 128-bit integer vector right by
   1381 ///    a bit count taken from a second vector, clearing the high-order bits.
   1382 ///
   1383 /// \headerfile <x86intrin.h>
   1384 ///
   1385 /// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
   1386 ///
   1387 /// \param __a
   1388 ///    A 128-bit integer vector holding the values to shift.
   1389 /// \param __count
   1390 ///    A 128-bit integer vector whose bits [63:0] hold the bit count.
   1391 /// \returns A 128-bit integer vector holding the right-shifted values.
   1392 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1393 _mm_srl_epi16(__m128i __a, __m128i __count)
   1394 {
   1395   __v8hi __shift = (__v8hi)__count;
   1396   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, __shift);
   1397 }
   1398 
   1399 /// \brief Shifts every 32-bit element of a 128-bit integer vector right by
   1400 ///    the given bit count, clearing the vacated high-order bits.
   1401 ///
   1402 /// \headerfile <x86intrin.h>
   1403 ///
   1404 /// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
   1405 ///
   1406 /// \param __a
   1407 ///    A 128-bit integer vector holding the values to shift.
   1408 /// \param __count
   1409 ///    An integer giving the number of bits by which each value is shifted.
   1410 /// \returns A 128-bit integer vector holding the right-shifted values.
   1411 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1412 _mm_srli_epi32(__m128i __a, int __count)
   1413 {
   1414   __v4si __elems = (__v4si)__a;
   1415   return (__m128i)__builtin_ia32_psrldi128(__elems, __count);
   1416 }
   1417 
   1418 /// \brief Shifts every 32-bit element of a 128-bit integer vector right by
   1419 ///    a bit count taken from a second vector, clearing the high-order bits.
   1420 ///
   1421 /// \headerfile <x86intrin.h>
   1422 ///
   1423 /// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
   1424 ///
   1425 /// \param __a
   1426 ///    A 128-bit integer vector holding the values to shift.
   1427 /// \param __count
   1428 ///    A 128-bit integer vector whose bits [63:0] hold the bit count.
   1429 /// \returns A 128-bit integer vector holding the right-shifted values.
   1430 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1431 _mm_srl_epi32(__m128i __a, __m128i __count)
   1432 {
   1433   __v4si __shift = (__v4si)__count;
   1434   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, __shift);
   1435 }
   1436 
   1437 /// \brief Shifts every 64-bit element of a 128-bit integer vector right by
   1438 ///    the given bit count, clearing the vacated high-order bits.
   1439 ///
   1440 /// \headerfile <x86intrin.h>
   1441 ///
   1442 /// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
   1443 ///
   1444 /// \param __a
   1445 ///    A 128-bit integer vector holding the values to shift.
   1446 /// \param __count
   1447 ///    An integer giving the number of bits by which each value is shifted.
   1448 /// \returns A 128-bit integer vector holding the right-shifted values.
   1449 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1450 _mm_srli_epi64(__m128i __a, int __count)
   1451 {
   1452   /* Cast the result explicitly, as the 16- and 32-bit shifts do. */
   1453   return (__m128i)__builtin_ia32_psrlqi128((__v2di)__a, __count);
   1454 }
   1455 
   1456 /// \brief Shifts every 64-bit element of a 128-bit integer vector right by
   1457 ///    a bit count taken from a second vector, clearing the high-order bits.
   1458 ///
   1459 /// \headerfile <x86intrin.h>
   1460 ///
   1461 /// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
   1462 ///
   1463 /// \param __a
   1464 ///    A 128-bit integer vector holding the values to shift.
   1465 /// \param __count
   1466 ///    A 128-bit integer vector whose bits [63:0] hold the bit count.
   1467 /// \returns A 128-bit integer vector holding the right-shifted values.
   1468 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1469 _mm_srl_epi64(__m128i __a, __m128i __count)
   1470 {
   1471   /* Cast the result explicitly, as the 16- and 32-bit shifts do. */
   1472   return (__m128i)__builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
   1473 }
   1474 
   1475 /// \brief Tests corresponding 8-bit elements of two 128-bit integer vectors
   1476 ///    for equality. Each result element is FFh if equal and 0h otherwise.
   1477 ///
   1478 /// \headerfile <x86intrin.h>
   1479 ///
   1480 /// This intrinsic corresponds to the \c VPCMPEQB / PCMPEQB instruction.
   1481 ///
   1482 /// \param __a
   1483 ///    A 128-bit integer vector; the first comparison operand.
   1484 /// \param __b
   1485 ///    A 128-bit integer vector; the second comparison operand.
   1486 /// \returns A 128-bit integer vector holding the comparison results.
   1487 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1488 _mm_cmpeq_epi8(__m128i __a, __m128i __b)
   1489 {
   1490   __v16qi __lhs = (__v16qi)__a;
   1491   return (__m128i)(__lhs == (__v16qi)__b);
   1492 }
   1493 
   1494 /// \brief Tests corresponding 16-bit elements of two 128-bit integer vectors
   1495 ///    for equality. Each result element is FFFFh if equal and 0h otherwise.
   1496 ///
   1497 /// \headerfile <x86intrin.h>
   1498 ///
   1499 /// This intrinsic corresponds to the \c VPCMPEQW / PCMPEQW instruction.
   1500 ///
   1501 /// \param __a
   1502 ///    A 128-bit integer vector; the first comparison operand.
   1503 /// \param __b
   1504 ///    A 128-bit integer vector; the second comparison operand.
   1505 /// \returns A 128-bit integer vector holding the comparison results.
   1506 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1507 _mm_cmpeq_epi16(__m128i __a, __m128i __b)
   1508 {
   1509   __v8hi __lhs = (__v8hi)__a;
   1510   return (__m128i)(__lhs == (__v8hi)__b);
   1511 }
   1512 
   1513 /// \brief Tests corresponding 32-bit elements of two 128-bit integer vectors
   1514 ///    for equality. A result element is FFFFFFFFh if equal, 0h otherwise.
   1515 ///
   1516 /// \headerfile <x86intrin.h>
   1517 ///
   1518 /// This intrinsic corresponds to the \c VPCMPEQD / PCMPEQD instruction.
   1519 ///
   1520 /// \param __a
   1521 ///    A 128-bit integer vector; the first comparison operand.
   1522 /// \param __b
   1523 ///    A 128-bit integer vector; the second comparison operand.
   1524 /// \returns A 128-bit integer vector holding the comparison results.
   1525 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1526 _mm_cmpeq_epi32(__m128i __a, __m128i __b)
   1527 {
   1528   __v4si __lhs = (__v4si)__a;
   1529   return (__m128i)(__lhs == (__v4si)__b);
   1530 }
   1531 
   1532 /// \brief Tests whether each signed 8-bit element of the first 128-bit
   1533 ///    integer vector is greater than the matching element of the second.
   1534 ///    Each result element is FFh when the test holds and 0h otherwise.
   1535 ///
   1536 /// \headerfile <x86intrin.h>
   1537 ///
   1538 /// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
   1539 ///
   1540 /// \param __a
   1541 ///    A 128-bit integer vector; the left-hand comparison operand.
   1542 /// \param __b
   1543 ///    A 128-bit integer vector; the right-hand comparison operand.
   1544 /// \returns A 128-bit integer vector holding the comparison results.
   1545 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1546 _mm_cmpgt_epi8(__m128i __a, __m128i __b)
   1547 {
   1548   /* The comparison must be signed. __v16qi is built on plain char, whose
   1549      signedness varies, so the operands are viewed as __v16qs instead. */
   1550   __v16qs __lhs = (__v16qs)__a;
   1551   return (__m128i)(__lhs > (__v16qs)__b);
   1552 }
   1553 
   1554 /// \brief Tests whether each signed 16-bit element of the first 128-bit
   1555 ///    integer vector is greater than the matching element of the second.
   1556 ///    Each result element is FFFFh when the test holds and 0h otherwise.
   1557 ///
   1558 /// \headerfile <x86intrin.h>
   1559 ///
   1560 /// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
   1561 ///
   1562 /// \param __a
   1563 ///    A 128-bit integer vector; the left-hand comparison operand.
   1564 /// \param __b
   1565 ///    A 128-bit integer vector; the right-hand comparison operand.
   1566 /// \returns A 128-bit integer vector holding the comparison results.
   1567 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1568 _mm_cmpgt_epi16(__m128i __a, __m128i __b)
   1569 {
   1570   __v8hi __lhs = (__v8hi)__a;
   1571   return (__m128i)(__lhs > (__v8hi)__b);
   1572 }
   1573 
   1574 /// \brief Tests whether each signed 32-bit element of the first 128-bit
   1575 ///    integer vector is greater than the matching element of the second.
   1576 ///    Each result element is FFFFFFFFh when the test holds and 0h otherwise.
   1577 ///
   1578 /// \headerfile <x86intrin.h>
   1579 ///
   1580 /// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
   1581 ///
   1582 /// \param __a
   1583 ///    A 128-bit integer vector; the left-hand comparison operand.
   1584 /// \param __b
   1585 ///    A 128-bit integer vector; the right-hand comparison operand.
   1586 /// \returns A 128-bit integer vector holding the comparison results.
   1587 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1588 _mm_cmpgt_epi32(__m128i __a, __m128i __b)
   1589 {
   1590   __v4si __lhs = (__v4si)__a;
   1591   return (__m128i)(__lhs > (__v4si)__b);
   1592 }
   1593 
   1594 /// \brief Tests whether each signed 8-bit element of the first 128-bit
   1595 ///    integer vector is less than the matching element of the second. Each
   1596 ///    result element is FFh when the test holds and 0h otherwise. The test
   1597 ///    is evaluated as a greater-than comparison with the operands swapped.
   1598 ///
   1599 /// \headerfile <x86intrin.h>
   1600 ///
   1601 /// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
   1602 ///
   1603 /// \param __a
   1604 ///    A 128-bit integer vector; the left-hand comparison operand.
   1605 /// \param __b
   1606 ///    A 128-bit integer vector; the right-hand comparison operand.
   1607 /// \returns A 128-bit integer vector holding the comparison results.
   1608 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1609 _mm_cmplt_epi8(__m128i __a, __m128i __b)
   1610 {
   1611   return (__m128i)((__v16qs)__b > (__v16qs)__a);
   1612 }
   1613 
   1614 /// \brief Compares each of the corresponding signed 16-bit values of the
   1615 ///    128-bit integer vectors to determine if the values in the first operand
   1616 ///    are less than those in the second operand. Each comparison yields 0h for
   1617 ///    false, FFFFh for true.
   1618 ///
   1619 /// \headerfile <x86intrin.h>
   1620 ///
   1621 /// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
   1622 ///
   1623 /// \param __a
   1624 ///    A 128-bit integer vector.
   1625 /// \param __b
   1626 ///    A 128-bit integer vector.
   1627 /// \returns A 128-bit integer vector containing the comparison results.
   1628 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1629 _mm_cmplt_epi16(__m128i __a, __m128i __b)
   1630 {
   1631   return _mm_cmpgt_epi16(__b, __a);
   1632 }
   1633 
   1634 /// \brief Compares each of the corresponding signed 32-bit values of the
   1635 ///    128-bit integer vectors to determine if the values in the first operand
   1636 ///    are less than those in the second operand. Each comparison yields 0h for
   1637 ///    false, FFFFFFFFh for true.
   1638 ///
   1639 /// \headerfile <x86intrin.h>
   1640 ///
   1641 /// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
   1642 ///
   1643 /// \param __a
   1644 ///    A 128-bit integer vector.
   1645 /// \param __b
   1646 ///    A 128-bit integer vector.
   1647 /// \returns A 128-bit integer vector containing the comparison results.
   1648 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1649 _mm_cmplt_epi32(__m128i __a, __m128i __b)
   1650 {
   1651   return _mm_cmpgt_epi32(__b, __a);
   1652 }
   1653 
   1654 #ifdef __x86_64__
   1655 /// \brief Converts a 64-bit signed integer value from the second operand into a
   1656 ///    double-precision value and returns it in the lower element of a [2 x
   1657 ///    double] vector; the upper element of the returned vector is copied from
   1658 ///    the upper element of the first operand.
   1659 ///
   1660 /// \headerfile <x86intrin.h>
   1661 ///
   1662 /// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction.
   1663 ///
   1664 /// \param __a
   1665 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
   1666 ///    copied to the upper 64 bits of the destination.
   1667 /// \param __b
   1668 ///    A 64-bit signed integer operand containing the value to be converted.
   1669 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
   1670 ///    converted value of the second operand. The upper 64 bits are copied from
   1671 ///    the upper 64 bits of the first operand.
   1672 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1673 _mm_cvtsi64_sd(__m128d __a, long long __b)
   1674 {
   1675   __a[0] = __b;
   1676   return __a;
   1677 }
   1678 
   1679 /// \brief Converts the first (lower) element of a vector of [2 x double] into a
   1680 ///    64-bit signed integer value, according to the current rounding mode.
   1681 ///
   1682 /// \headerfile <x86intrin.h>
   1683 ///
   1684 /// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction.
   1685 ///
   1686 /// \param __a
   1687 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
   1688 ///    conversion.
   1689 /// \returns A 64-bit signed integer containing the converted value.
   1690 static __inline__ long long __DEFAULT_FN_ATTRS
   1691 _mm_cvtsd_si64(__m128d __a)
   1692 {
   1693   return __builtin_ia32_cvtsd2si64((__v2df)__a);
   1694 }
   1695 
   1696 /// \brief Converts the first (lower) element of a vector of [2 x double] into a
   1697 ///    64-bit signed integer value, truncating the result when it is inexact.
   1698 ///
   1699 /// \headerfile <x86intrin.h>
   1700 ///
   1701 /// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction.
   1702 ///
   1703 /// \param __a
   1704 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
   1705 ///    conversion.
   1706 /// \returns A 64-bit signed integer containing the converted value.
   1707 static __inline__ long long __DEFAULT_FN_ATTRS
   1708 _mm_cvttsd_si64(__m128d __a)
   1709 {
   1710   return __a[0];
   1711 }
   1712 #endif
   1713 
   1714 /// \brief Converts a vector of [4 x i32] into a vector of [4 x float].
   1715 ///
   1716 /// \headerfile <x86intrin.h>
   1717 ///
   1718 /// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
   1719 ///
   1720 /// \param __a
   1721 ///    A 128-bit integer vector.
   1722 /// \returns A 128-bit vector of [4 x float] containing the converted values.
   1723 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1724 _mm_cvtepi32_ps(__m128i __a)
   1725 {
   1726   return __builtin_ia32_cvtdq2ps((__v4si)__a);
   1727 }
   1728 
   1729 /// \brief Converts a vector of [4 x float] into a vector of [4 x i32].
   1730 ///
   1731 /// \headerfile <x86intrin.h>
   1732 ///
   1733 /// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
   1734 ///
   1735 /// \param __a
   1736 ///    A 128-bit vector of [4 x float].
   1737 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
   1738 ///    values.
   1739 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1740 _mm_cvtps_epi32(__m128 __a)
   1741 {
   1742   return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
   1743 }
   1744 
   1745 /// \brief Converts a vector of [4 x float] into a vector of [4 x i32],
   1746 ///    truncating the result when it is inexact.
   1747 ///
   1748 /// \headerfile <x86intrin.h>
   1749 ///
   1750 /// This intrinsic corresponds to the \c VCVTTPS2DQ / CVTTPS2DQ instruction.
   1751 ///
   1752 /// \param __a
   1753 ///    A 128-bit vector of [4 x float].
   1754 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
   1755 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1756 _mm_cvttps_epi32(__m128 __a)
   1757 {
   1758   return (__m128i)__builtin_convertvector((__v4sf)__a, __v4si);
   1759 }
   1760 
   1761 /// \brief Returns a vector of [4 x i32] where the lowest element is the input
   1762 ///    operand and the remaining elements are zero.
   1763 ///
   1764 /// \headerfile <x86intrin.h>
   1765 ///
   1766 /// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
   1767 ///
   1768 /// \param __a
   1769 ///    A 32-bit signed integer operand.
   1770 /// \returns A 128-bit vector of [4 x i32].
   1771 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1772 _mm_cvtsi32_si128(int __a)
   1773 {
   1774   return (__m128i)(__v4si){ __a, 0, 0, 0 };
   1775 }
   1776 
   1777 #ifdef __x86_64__
   1778 /// \brief Returns a vector of [2 x i64] where the lower element is the input
   1779 ///    operand and the upper element is zero.
   1780 ///
   1781 /// \headerfile <x86intrin.h>
   1782 ///
   1783 /// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
   1784 ///
   1785 /// \param __a
   1786 ///    A 64-bit signed integer operand containing the value to be converted.
   1787 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
   1788 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1789 _mm_cvtsi64_si128(long long __a)
   1790 {
   1791   return (__m128i){ __a, 0 };
   1792 }
   1793 #endif
   1794 
   1795 /// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a
   1796 ///    32-bit signed integer value.
   1797 ///
   1798 /// \headerfile <x86intrin.h>
   1799 ///
   1800 /// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
   1801 ///
   1802 /// \param __a
   1803 ///    A vector of [4 x i32]. The least significant 32 bits are moved to the
   1804 ///    destination.
   1805 /// \returns A 32-bit signed integer containing the moved value.
   1806 static __inline__ int __DEFAULT_FN_ATTRS
   1807 _mm_cvtsi128_si32(__m128i __a)
   1808 {
   1809   __v4si __b = (__v4si)__a;
   1810   return __b[0];
   1811 }
   1812 
   1813 #ifdef __x86_64__
   1814 /// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a
   1815 ///    64-bit signed integer value.
   1816 ///
   1817 /// \headerfile <x86intrin.h>
   1818 ///
   1819 /// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
   1820 ///
   1821 /// \param __a
   1822 ///    A vector of [2 x i64]. The least significant 64 bits are moved to the
   1823 ///    destination.
   1824 /// \returns A 64-bit signed integer containing the moved value.
   1825 static __inline__ long long __DEFAULT_FN_ATTRS
   1826 _mm_cvtsi128_si64(__m128i __a)
   1827 {
   1828   return __a[0];
   1829 }
   1830 #endif
   1831 
   1832 /// \brief Moves packed integer values from an aligned 128-bit memory location
   1833 ///    to elements in a 128-bit integer vector.
   1834 ///
   1835 /// \headerfile <x86intrin.h>
   1836 ///
   1837 /// This intrinsic corresponds to the \c VMOVDQA / MOVDQA instruction.
   1838 ///
   1839 /// \param __p
   1840 ///    An aligned pointer to a memory location containing integer values.
   1841 /// \returns A 128-bit integer vector containing the moved values.
   1842 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1843 _mm_load_si128(__m128i const *__p)
   1844 {
   1845   return *__p;
   1846 }
   1847 
   1848 /// \brief Moves packed integer values from an unaligned 128-bit memory location
   1849 ///    to elements in a 128-bit integer vector.
   1850 ///
   1851 /// \headerfile <x86intrin.h>
   1852 ///
   1853 /// This intrinsic corresponds to the \c VMOVDQU / MOVDQU instruction.
   1854 ///
   1855 /// \param __p
   1856 ///    A pointer to a memory location containing integer values.
   1857 /// \returns A 128-bit integer vector containing the moved values.
   1858 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1859 _mm_loadu_si128(__m128i const *__p)
   1860 {
   1861   struct __loadu_si128 {
   1862     __m128i __v;
   1863   } __attribute__((__packed__, __may_alias__));
   1864   return ((struct __loadu_si128*)__p)->__v;
   1865 }
   1866 
   1867 /// \brief Returns a vector of [2 x i64] where the lower element is taken from
   1868 ///    the lower element of the operand, and the upper element is zero.
   1869 ///
   1870 /// \headerfile <x86intrin.h>
   1871 ///
   1872 /// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
   1873 ///
   1874 /// \param __p
   1875 ///    A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
   1876 ///    the destination.
   1877 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
   1878 ///    moved value. The higher order bits are cleared.
   1879 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1880 _mm_loadl_epi64(__m128i const *__p)
   1881 {
   1882   struct __mm_loadl_epi64_struct {
   1883     long long __u;
   1884   } __attribute__((__packed__, __may_alias__));
   1885   return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
   1886 }
   1887 
   1888 /// \brief Generates a 128-bit vector of [4 x i32] with unspecified content.
   1889 ///    This could be used as an argument to another intrinsic function where the
   1890 ///    argument is required but the value is not actually used.
   1891 ///
   1892 /// \headerfile <x86intrin.h>
   1893 ///
   1894 /// This intrinsic has no corresponding instruction.
   1895 ///
   1896 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
   1897 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1898 _mm_undefined_si128(void)
   1899 {
   1900   return (__m128i)__builtin_ia32_undef128();
   1901 }
   1902 
   1903 /// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
   1904 ///    the specified 64-bit integer values.
   1905 ///
   1906 /// \headerfile <x86intrin.h>
   1907 ///
   1908 /// This intrinsic is a utility function and does not correspond to a specific
   1909 ///    instruction.
   1910 ///
   1911 /// \param __q1
   1912 ///    A 64-bit integer value used to initialize the upper 64 bits of the
   1913 ///    destination vector of [2 x i64].
   1914 /// \param __q0
   1915 ///    A 64-bit integer value used to initialize the lower 64 bits of the
   1916 ///    destination vector of [2 x i64].
   1917 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
   1918 ///    provided in the operands.
   1919 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1920 _mm_set_epi64x(long long __q1, long long __q0)
   1921 {
   1922   return (__m128i){ __q0, __q1 };
   1923 }
   1924 
   1925 /// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
   1926 ///    the specified 64-bit integer values.
   1927 ///
   1928 /// \headerfile <x86intrin.h>
   1929 ///
   1930 /// This intrinsic is a utility function and does not correspond to a specific
   1931 ///    instruction.
   1932 ///
   1933 /// \param __q1
   1934 ///    A 64-bit integer value used to initialize the upper 64 bits of the
   1935 ///    destination vector of [2 x i64].
   1936 /// \param __q0
   1937 ///    A 64-bit integer value used to initialize the lower 64 bits of the
   1938 ///    destination vector of [2 x i64].
   1939 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
   1940 ///    provided in the operands.
   1941 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1942 _mm_set_epi64(__m64 __q1, __m64 __q0)
   1943 {
   1944   return (__m128i){ (long long)__q0, (long long)__q1 };
   1945 }
   1946 
   1947 /// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
   1948 ///    the specified 32-bit integer values.
   1949 ///
   1950 /// \headerfile <x86intrin.h>
   1951 ///
   1952 /// This intrinsic is a utility function and does not correspond to a specific
   1953 ///    instruction.
   1954 ///
   1955 /// \param __i3
   1956 ///    A 32-bit integer value used to initialize bits [127:96] of the
   1957 ///    destination vector.
   1958 /// \param __i2
   1959 ///    A 32-bit integer value used to initialize bits [95:64] of the destination
   1960 ///    vector.
   1961 /// \param __i1
   1962 ///    A 32-bit integer value used to initialize bits [63:32] of the destination
   1963 ///    vector.
   1964 /// \param __i0
   1965 ///    A 32-bit integer value used to initialize bits [31:0] of the destination
   1966 ///    vector.
   1967 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
   1968 ///    provided in the operands.
   1969 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1970 _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
   1971 {
   1972   return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
   1973 }
   1974 
   1975 /// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
   1976 ///    the specified 16-bit integer values.
   1977 ///
   1978 /// \headerfile <x86intrin.h>
   1979 ///
   1980 /// This intrinsic is a utility function and does not correspond to a specific
   1981 ///    instruction.
   1982 ///
   1983 /// \param __w7
   1984 ///    A 16-bit integer value used to initialize bits [127:112] of the
   1985 ///    destination vector.
   1986 /// \param __w6
   1987 ///    A 16-bit integer value used to initialize bits [111:96] of the
   1988 ///    destination vector.
   1989 /// \param __w5
   1990 ///    A 16-bit integer value used to initialize bits [95:80] of the destination
   1991 ///    vector.
   1992 /// \param __w4
   1993 ///    A 16-bit integer value used to initialize bits [79:64] of the destination
   1994 ///    vector.
   1995 /// \param __w3
   1996 ///    A 16-bit integer value used to initialize bits [63:48] of the destination
   1997 ///    vector.
   1998 /// \param __w2
   1999 ///    A 16-bit integer value used to initialize bits [47:32] of the destination
   2000 ///    vector.
   2001 /// \param __w1
   2002 ///    A 16-bit integer value used to initialize bits [31:16] of the destination
   2003 ///    vector.
   2004 /// \param __w0
   2005 ///    A 16-bit integer value used to initialize bits [15:0] of the destination
   2006 ///    vector.
   2007 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
   2008 ///    provided in the operands.
   2009 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2010 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
   2011 {
   2012   return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
   2013 }
   2014 
   2015 /// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
   2016 ///    the specified 8-bit integer values.
   2017 ///
   2018 /// \headerfile <x86intrin.h>
   2019 ///
   2020 /// This intrinsic is a utility function and does not correspond to a specific
   2021 ///    instruction.
   2022 ///
   2023 /// \param __b15
   2024 ///    Initializes bits [127:120] of the destination vector.
   2025 /// \param __b14
   2026 ///    Initializes bits [119:112] of the destination vector.
   2027 /// \param __b13
   2028 ///    Initializes bits [111:104] of the destination vector.
   2029 /// \param __b12
   2030 ///    Initializes bits [103:96] of the destination vector.
   2031 /// \param __b11
   2032 ///    Initializes bits [95:88] of the destination vector.
   2033 /// \param __b10
   2034 ///    Initializes bits [87:80] of the destination vector.
   2035 /// \param __b9
   2036 ///    Initializes bits [79:72] of the destination vector.
   2037 /// \param __b8
   2038 ///    Initializes bits [71:64] of the destination vector.
   2039 /// \param __b7
   2040 ///    Initializes bits [63:56] of the destination vector.
   2041 /// \param __b6
   2042 ///    Initializes bits [55:48] of the destination vector.
   2043 /// \param __b5
   2044 ///    Initializes bits [47:40] of the destination vector.
   2045 /// \param __b4
   2046 ///    Initializes bits [39:32] of the destination vector.
   2047 /// \param __b3
   2048 ///    Initializes bits [31:24] of the destination vector.
   2049 /// \param __b2
   2050 ///    Initializes bits [23:16] of the destination vector.
   2051 /// \param __b1
   2052 ///    Initializes bits [15:8] of the destination vector.
   2053 /// \param __b0
   2054 ///    Initializes bits [7:0] of the destination vector.
   2055 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
   2056 ///    provided in the operands.
   2057 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2058 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
   2059 {
   2060   return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
   2061 }
   2062 
   2063 /// \brief Initializes both values in a 128-bit integer vector with the
   2064 ///    specified 64-bit integer value.
   2065 ///
   2066 /// \headerfile <x86intrin.h>
   2067 ///
   2068 /// This intrinsic is a utility function and does not correspond to a specific
   2069 ///    instruction.
   2070 ///
   2071 /// \param __q
   2072 ///    Integer value used to initialize the elements of the destination integer
   2073 ///    vector.
   2074 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
   2075 ///    elements containing the value provided in the operand.
   2076 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2077 _mm_set1_epi64x(long long __q)
   2078 {
   2079   return (__m128i){ __q, __q };
   2080 }
   2081 
   2082 /// \brief Initializes both values in a 128-bit vector of [2 x i64] with the
   2083 ///    specified 64-bit value.
   2084 ///
   2085 /// \headerfile <x86intrin.h>
   2086 ///
   2087 /// This intrinsic is a utility function and does not correspond to a specific
   2088 ///    instruction.
   2089 ///
   2090 /// \param __q
   2091 ///    A 64-bit value used to initialize the elements of the destination integer
   2092 ///    vector.
   2093 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
   2094 ///    containing the value provided in the operand.
   2095 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2096 _mm_set1_epi64(__m64 __q)
   2097 {
   2098   return (__m128i){ (long long)__q, (long long)__q };
   2099 }
   2100 
   2101 /// \brief Initializes all values in a 128-bit vector of [4 x i32] with the
   2102 ///    specified 32-bit value.
   2103 ///
   2104 /// \headerfile <x86intrin.h>
   2105 ///
   2106 /// This intrinsic is a utility function and does not correspond to a specific
   2107 ///    instruction.
   2108 ///
   2109 /// \param __i
   2110 ///    A 32-bit value used to initialize the elements of the destination integer
   2111 ///    vector.
   2112 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
   2113 ///    containing the value provided in the operand.
   2114 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2115 _mm_set1_epi32(int __i)
   2116 {
   2117   return (__m128i)(__v4si){ __i, __i, __i, __i };
   2118 }
   2119 
   2120 /// \brief Initializes all values in a 128-bit vector of [8 x i16] with the
   2121 ///    specified 16-bit value.
   2122 ///
   2123 /// \headerfile <x86intrin.h>
   2124 ///
   2125 /// This intrinsic is a utility function and does not correspond to a specific
   2126 ///    instruction.
   2127 ///
   2128 /// \param __w
   2129 ///    A 16-bit value used to initialize the elements of the destination integer
   2130 ///    vector.
   2131 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
   2132 ///    containing the value provided in the operand.
   2133 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2134 _mm_set1_epi16(short __w)
   2135 {
   2136   return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
   2137 }
   2138 
   2139 /// \brief Initializes all values in a 128-bit vector of [16 x i8] with the
   2140 ///    specified 8-bit value.
   2141 ///
   2142 /// \headerfile <x86intrin.h>
   2143 ///
   2144 /// This intrinsic is a utility function and does not correspond to a specific
   2145 ///    instruction.
   2146 ///
   2147 /// \param __b
   2148 ///    An 8-bit value used to initialize the elements of the destination integer
   2149 ///    vector.
   2150 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
   2151 ///    containing the value provided in the operand.
   2152 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2153 _mm_set1_epi8(char __b)
   2154 {
   2155   return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
   2156 }
   2157 
   2158 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2159 _mm_setr_epi64(__m64 __q0, __m64 __q1)
   2160 {
   2161   return (__m128i){ (long long)__q0, (long long)__q1 };
   2162 }
   2163 
   2164 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2165 _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
   2166 {
   2167   return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
   2168 }
   2169 
   2170 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2171 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
   2172 {
   2173   return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
   2174 }
   2175 
   2176 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2177 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
   2178 {
   2179   return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
   2180 }
   2181 
   2182 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2183 _mm_setzero_si128(void)
   2184 {
   2185   return (__m128i){ 0LL, 0LL };
   2186 }
   2187 
   2188 static __inline__ void __DEFAULT_FN_ATTRS
   2189 _mm_store_si128(__m128i *__p, __m128i __b)
   2190 {
   2191   *__p = __b;
   2192 }
   2193 
   2194 static __inline__ void __DEFAULT_FN_ATTRS
   2195 _mm_storeu_si128(__m128i *__p, __m128i __b)
   2196 {
   2197   struct __storeu_si128 {
   2198     __m128i __v;
   2199   } __attribute__((__packed__, __may_alias__));
   2200   ((struct __storeu_si128*)__p)->__v = __b;
   2201 }
   2202 
   2203 static __inline__ void __DEFAULT_FN_ATTRS
   2204 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
   2205 {
   2206   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
   2207 }
   2208 
   2209 static __inline__ void __DEFAULT_FN_ATTRS
   2210 _mm_storel_epi64(__m128i *__p, __m128i __a)
   2211 {
   2212   struct __mm_storel_epi64_struct {
   2213     long long __u;
   2214   } __attribute__((__packed__, __may_alias__));
   2215   ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
   2216 }
   2217 
   2218 static __inline__ void __DEFAULT_FN_ATTRS
   2219 _mm_stream_pd(double *__p, __m128d __a)
   2220 {
   2221   __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
   2222 }
   2223 
   2224 static __inline__ void __DEFAULT_FN_ATTRS
   2225 _mm_stream_si128(__m128i *__p, __m128i __a)
   2226 {
   2227   __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
   2228 }
   2229 
   2230 static __inline__ void __DEFAULT_FN_ATTRS
   2231 _mm_stream_si32(int *__p, int __a)
   2232 {
   2233   __builtin_ia32_movnti(__p, __a);
   2234 }
   2235 
   2236 #ifdef __x86_64__
   2237 static __inline__ void __DEFAULT_FN_ATTRS
   2238 _mm_stream_si64(long long *__p, long long __a)
   2239 {
   2240   __builtin_ia32_movnti64(__p, __a);
   2241 }
   2242 #endif
   2243 
   2244 static __inline__ void __DEFAULT_FN_ATTRS
   2245 _mm_clflush(void const *__p)
   2246 {
   2247   __builtin_ia32_clflush(__p);
   2248 }
   2249 
   2250 static __inline__ void __DEFAULT_FN_ATTRS
   2251 _mm_lfence(void)
   2252 {
   2253   __builtin_ia32_lfence();
   2254 }
   2255 
   2256 static __inline__ void __DEFAULT_FN_ATTRS
   2257 _mm_mfence(void)
   2258 {
   2259   __builtin_ia32_mfence();
   2260 }
   2261 
   2262 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2263 _mm_packs_epi16(__m128i __a, __m128i __b)
   2264 {
   2265   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
   2266 }
   2267 
   2268 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2269 _mm_packs_epi32(__m128i __a, __m128i __b)
   2270 {
   2271   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
   2272 }
   2273 
   2274 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2275 _mm_packus_epi16(__m128i __a, __m128i __b)
   2276 {
   2277   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
   2278 }
   2279 
   2280 static __inline__ int __DEFAULT_FN_ATTRS
   2281 _mm_extract_epi16(__m128i __a, int __imm)
   2282 {
   2283   __v8hi __b = (__v8hi)__a;
   2284   return (unsigned short)__b[__imm & 7];
   2285 }
   2286 
   2287 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2288 _mm_insert_epi16(__m128i __a, int __b, int __imm)
   2289 {
   2290   __v8hi __c = (__v8hi)__a;
   2291   __c[__imm & 7] = __b;
   2292   return (__m128i)__c;
   2293 }
   2294 
   2295 static __inline__ int __DEFAULT_FN_ATTRS
   2296 _mm_movemask_epi8(__m128i __a)
   2297 {
   2298   return __builtin_ia32_pmovmskb128((__v16qi)__a);
   2299 }
   2300 
   2301 #define _mm_shuffle_epi32(a, imm) __extension__ ({ \
   2302   (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
   2303                                    (__v4si)_mm_undefined_si128(), \
   2304                                    ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
   2305                                    ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); })
   2306 
   2307 #define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
   2308   (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
   2309                                    (__v8hi)_mm_undefined_si128(), \
   2310                                    ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
   2311                                    ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \
   2312                                    4, 5, 6, 7); })
   2313 
   2314 #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
   2315   (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
   2316                                    (__v8hi)_mm_undefined_si128(), \
   2317                                    0, 1, 2, 3, \
   2318                                    4 + (((imm) >> 0) & 0x3), \
   2319                                    4 + (((imm) >> 2) & 0x3), \
   2320                                    4 + (((imm) >> 4) & 0x3), \
   2321                                    4 + (((imm) >> 6) & 0x3)); })
   2322 
   2323 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2324 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
   2325 {
   2326   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
   2327 }
   2328 
   2329 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2330 _mm_unpackhi_epi16(__m128i __a, __m128i __b)
   2331 {
   2332   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
   2333 }
   2334 
   2335 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2336 _mm_unpackhi_epi32(__m128i __a, __m128i __b)
   2337 {
   2338   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
   2339 }
   2340 
   2341 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2342 _mm_unpackhi_epi64(__m128i __a, __m128i __b)
   2343 {
   2344   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
   2345 }
   2346 
   2347 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2348 _mm_unpacklo_epi8(__m128i __a, __m128i __b)
   2349 {
   2350   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
   2351 }
   2352 
   2353 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2354 _mm_unpacklo_epi16(__m128i __a, __m128i __b)
   2355 {
   2356   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
   2357 }
   2358 
   2359 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2360 _mm_unpacklo_epi32(__m128i __a, __m128i __b)
   2361 {
   2362   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
   2363 }
   2364 
   2365 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2366 _mm_unpacklo_epi64(__m128i __a, __m128i __b)
   2367 {
   2368   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
   2369 }
   2370 
   2371 static __inline__ __m64 __DEFAULT_FN_ATTRS
   2372 _mm_movepi64_pi64(__m128i __a)
   2373 {
   2374   return (__m64)__a[0];
   2375 }
   2376 
   2377 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2378 _mm_movpi64_epi64(__m64 __a)
   2379 {
   2380   return (__m128i){ (long long)__a, 0 };
   2381 }
   2382 
   2383 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2384 _mm_move_epi64(__m128i __a)
   2385 {
   2386   return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
   2387 }
   2388 
   2389 static __inline__ __m128d __DEFAULT_FN_ATTRS
   2390 _mm_unpackhi_pd(__m128d __a, __m128d __b)
   2391 {
   2392   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
   2393 }
   2394 
   2395 static __inline__ __m128d __DEFAULT_FN_ATTRS
   2396 _mm_unpacklo_pd(__m128d __a, __m128d __b)
   2397 {
   2398   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
   2399 }
   2400 
   2401 static __inline__ int __DEFAULT_FN_ATTRS
   2402 _mm_movemask_pd(__m128d __a)
   2403 {
   2404   return __builtin_ia32_movmskpd((__v2df)__a);
   2405 }
   2406 
   2407 #define _mm_shuffle_pd(a, b, i) __extension__ ({ \
   2408   (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
   2409                                    0 + (((i) >> 0) & 0x1), \
   2410                                    2 + (((i) >> 1) & 0x1)); })
   2411 
   2412 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2413 _mm_castpd_ps(__m128d __a)
   2414 {
   2415   return (__m128)__a;
   2416 }
   2417 
   2418 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2419 _mm_castpd_si128(__m128d __a)
   2420 {
   2421   return (__m128i)__a;
   2422 }
   2423 
   2424 static __inline__ __m128d __DEFAULT_FN_ATTRS
   2425 _mm_castps_pd(__m128 __a)
   2426 {
   2427   return (__m128d)__a;
   2428 }
   2429 
   2430 static __inline__ __m128i __DEFAULT_FN_ATTRS
   2431 _mm_castps_si128(__m128 __a)
   2432 {
   2433   return (__m128i)__a;
   2434 }
   2435 
   2436 static __inline__ __m128 __DEFAULT_FN_ATTRS
   2437 _mm_castsi128_ps(__m128i __a)
   2438 {
   2439   return (__m128)__a;
   2440 }
   2441 
   2442 static __inline__ __m128d __DEFAULT_FN_ATTRS
   2443 _mm_castsi128_pd(__m128i __a)
   2444 {
   2445   return (__m128d)__a;
   2446 }
   2447 
   2448 static __inline__ void __DEFAULT_FN_ATTRS
   2449 _mm_pause(void)
   2450 {
   2451   __builtin_ia32_pause();
   2452 }
   2453 
   2454 #undef __DEFAULT_FN_ATTRS
   2455 
   2456 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
   2457 
   2458 #endif /* __EMMINTRIN_H */
   2459