Home | History | Annotate | Download | only in include
      1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
      2  *
      3  * Permission is hereby granted, free of charge, to any person obtaining a copy
      4  * of this software and associated documentation files (the "Software"), to deal
      5  * in the Software without restriction, including without limitation the rights
      6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      7  * copies of the Software, and to permit persons to whom the Software is
      8  * furnished to do so, subject to the following conditions:
      9  *
     10  * The above copyright notice and this permission notice shall be included in
     11  * all copies or substantial portions of the Software.
     12  *
     13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     19  * THE SOFTWARE.
     20  *
     21  *===-----------------------------------------------------------------------===
     22  */
     23 
     24 #ifndef __EMMINTRIN_H
     25 #define __EMMINTRIN_H
     26 
     27 #include <xmmintrin.h>
     28 
     29 typedef double __m128d __attribute__((__vector_size__(16)));
     30 typedef long long __m128i __attribute__((__vector_size__(16)));
     31 
     32 /* Type defines.  */
     33 typedef double __v2df __attribute__ ((__vector_size__ (16)));
     34 typedef long long __v2di __attribute__ ((__vector_size__ (16)));
     35 typedef short __v8hi __attribute__((__vector_size__(16)));
     36 typedef char __v16qi __attribute__((__vector_size__(16)));
     37 
     38 /* We need an explicitly signed variant for char. Note that this shouldn't
     39  * appear in the interface though. */
     40 typedef signed char __v16qs __attribute__((__vector_size__(16)));
     41 
     42 #include <f16cintrin.h>
     43 
     44 /* Define the default attributes for the functions in this file. */
     45 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
     46 
     47 static __inline__ __m128d __DEFAULT_FN_ATTRS
     48 _mm_add_sd(__m128d __a, __m128d __b)
     49 {
     50   __a[0] += __b[0];
     51   return __a;
     52 }
     53 
     54 static __inline__ __m128d __DEFAULT_FN_ATTRS
     55 _mm_add_pd(__m128d __a, __m128d __b)
     56 {
     57   return __a + __b;
     58 }
     59 
     60 static __inline__ __m128d __DEFAULT_FN_ATTRS
     61 _mm_sub_sd(__m128d __a, __m128d __b)
     62 {
     63   __a[0] -= __b[0];
     64   return __a;
     65 }
     66 
     67 static __inline__ __m128d __DEFAULT_FN_ATTRS
     68 _mm_sub_pd(__m128d __a, __m128d __b)
     69 {
     70   return __a - __b;
     71 }
     72 
     73 static __inline__ __m128d __DEFAULT_FN_ATTRS
     74 _mm_mul_sd(__m128d __a, __m128d __b)
     75 {
     76   __a[0] *= __b[0];
     77   return __a;
     78 }
     79 
     80 static __inline__ __m128d __DEFAULT_FN_ATTRS
     81 _mm_mul_pd(__m128d __a, __m128d __b)
     82 {
     83   return __a * __b;
     84 }
     85 
     86 static __inline__ __m128d __DEFAULT_FN_ATTRS
     87 _mm_div_sd(__m128d __a, __m128d __b)
     88 {
     89   __a[0] /= __b[0];
     90   return __a;
     91 }
     92 
     93 static __inline__ __m128d __DEFAULT_FN_ATTRS
     94 _mm_div_pd(__m128d __a, __m128d __b)
     95 {
     96   return __a / __b;
     97 }
     98 
     99 static __inline__ __m128d __DEFAULT_FN_ATTRS
    100 _mm_sqrt_sd(__m128d __a, __m128d __b)
    101 {
    102   __m128d __c = __builtin_ia32_sqrtsd(__b);
    103   return (__m128d) { __c[0], __a[1] };
    104 }
    105 
    106 static __inline__ __m128d __DEFAULT_FN_ATTRS
    107 _mm_sqrt_pd(__m128d __a)
    108 {
    109   return __builtin_ia32_sqrtpd(__a);
    110 }
    111 
    112 static __inline__ __m128d __DEFAULT_FN_ATTRS
    113 _mm_min_sd(__m128d __a, __m128d __b)
    114 {
    115   return __builtin_ia32_minsd(__a, __b);
    116 }
    117 
    118 static __inline__ __m128d __DEFAULT_FN_ATTRS
    119 _mm_min_pd(__m128d __a, __m128d __b)
    120 {
    121   return __builtin_ia32_minpd(__a, __b);
    122 }
    123 
    124 static __inline__ __m128d __DEFAULT_FN_ATTRS
    125 _mm_max_sd(__m128d __a, __m128d __b)
    126 {
    127   return __builtin_ia32_maxsd(__a, __b);
    128 }
    129 
    130 static __inline__ __m128d __DEFAULT_FN_ATTRS
    131 _mm_max_pd(__m128d __a, __m128d __b)
    132 {
    133   return __builtin_ia32_maxpd(__a, __b);
    134 }
    135 
    136 static __inline__ __m128d __DEFAULT_FN_ATTRS
    137 _mm_and_pd(__m128d __a, __m128d __b)
    138 {
    139   return (__m128d)((__v4si)__a & (__v4si)__b);
    140 }
    141 
    142 static __inline__ __m128d __DEFAULT_FN_ATTRS
    143 _mm_andnot_pd(__m128d __a, __m128d __b)
    144 {
    145   return (__m128d)(~(__v4si)__a & (__v4si)__b);
    146 }
    147 
    148 static __inline__ __m128d __DEFAULT_FN_ATTRS
    149 _mm_or_pd(__m128d __a, __m128d __b)
    150 {
    151   return (__m128d)((__v4si)__a | (__v4si)__b);
    152 }
    153 
    154 static __inline__ __m128d __DEFAULT_FN_ATTRS
    155 _mm_xor_pd(__m128d __a, __m128d __b)
    156 {
    157   return (__m128d)((__v4si)__a ^ (__v4si)__b);
    158 }
    159 
    160 static __inline__ __m128d __DEFAULT_FN_ATTRS
    161 _mm_cmpeq_pd(__m128d __a, __m128d __b)
    162 {
    163   return (__m128d)__builtin_ia32_cmpeqpd(__a, __b);
    164 }
    165 
    166 static __inline__ __m128d __DEFAULT_FN_ATTRS
    167 _mm_cmplt_pd(__m128d __a, __m128d __b)
    168 {
    169   return (__m128d)__builtin_ia32_cmpltpd(__a, __b);
    170 }
    171 
    172 static __inline__ __m128d __DEFAULT_FN_ATTRS
    173 _mm_cmple_pd(__m128d __a, __m128d __b)
    174 {
    175   return (__m128d)__builtin_ia32_cmplepd(__a, __b);
    176 }
    177 
    178 static __inline__ __m128d __DEFAULT_FN_ATTRS
    179 _mm_cmpgt_pd(__m128d __a, __m128d __b)
    180 {
    181   return (__m128d)__builtin_ia32_cmpltpd(__b, __a);
    182 }
    183 
    184 static __inline__ __m128d __DEFAULT_FN_ATTRS
    185 _mm_cmpge_pd(__m128d __a, __m128d __b)
    186 {
    187   return (__m128d)__builtin_ia32_cmplepd(__b, __a);
    188 }
    189 
    190 static __inline__ __m128d __DEFAULT_FN_ATTRS
    191 _mm_cmpord_pd(__m128d __a, __m128d __b)
    192 {
    193   return (__m128d)__builtin_ia32_cmpordpd(__a, __b);
    194 }
    195 
    196 static __inline__ __m128d __DEFAULT_FN_ATTRS
    197 _mm_cmpunord_pd(__m128d __a, __m128d __b)
    198 {
    199   return (__m128d)__builtin_ia32_cmpunordpd(__a, __b);
    200 }
    201 
    202 static __inline__ __m128d __DEFAULT_FN_ATTRS
    203 _mm_cmpneq_pd(__m128d __a, __m128d __b)
    204 {
    205   return (__m128d)__builtin_ia32_cmpneqpd(__a, __b);
    206 }
    207 
    208 static __inline__ __m128d __DEFAULT_FN_ATTRS
    209 _mm_cmpnlt_pd(__m128d __a, __m128d __b)
    210 {
    211   return (__m128d)__builtin_ia32_cmpnltpd(__a, __b);
    212 }
    213 
    214 static __inline__ __m128d __DEFAULT_FN_ATTRS
    215 _mm_cmpnle_pd(__m128d __a, __m128d __b)
    216 {
    217   return (__m128d)__builtin_ia32_cmpnlepd(__a, __b);
    218 }
    219 
    220 static __inline__ __m128d __DEFAULT_FN_ATTRS
    221 _mm_cmpngt_pd(__m128d __a, __m128d __b)
    222 {
    223   return (__m128d)__builtin_ia32_cmpnltpd(__b, __a);
    224 }
    225 
    226 static __inline__ __m128d __DEFAULT_FN_ATTRS
    227 _mm_cmpnge_pd(__m128d __a, __m128d __b)
    228 {
    229   return (__m128d)__builtin_ia32_cmpnlepd(__b, __a);
    230 }
    231 
    232 static __inline__ __m128d __DEFAULT_FN_ATTRS
    233 _mm_cmpeq_sd(__m128d __a, __m128d __b)
    234 {
    235   return (__m128d)__builtin_ia32_cmpeqsd(__a, __b);
    236 }
    237 
    238 static __inline__ __m128d __DEFAULT_FN_ATTRS
    239 _mm_cmplt_sd(__m128d __a, __m128d __b)
    240 {
    241   return (__m128d)__builtin_ia32_cmpltsd(__a, __b);
    242 }
    243 
    244 static __inline__ __m128d __DEFAULT_FN_ATTRS
    245 _mm_cmple_sd(__m128d __a, __m128d __b)
    246 {
    247   return (__m128d)__builtin_ia32_cmplesd(__a, __b);
    248 }
    249 
    250 static __inline__ __m128d __DEFAULT_FN_ATTRS
    251 _mm_cmpgt_sd(__m128d __a, __m128d __b)
    252 {
    253   __m128d __c = __builtin_ia32_cmpltsd(__b, __a);
    254   return (__m128d) { __c[0], __a[1] };
    255 }
    256 
    257 static __inline__ __m128d __DEFAULT_FN_ATTRS
    258 _mm_cmpge_sd(__m128d __a, __m128d __b)
    259 {
    260   __m128d __c = __builtin_ia32_cmplesd(__b, __a);
    261   return (__m128d) { __c[0], __a[1] };
    262 }
    263 
    264 static __inline__ __m128d __DEFAULT_FN_ATTRS
    265 _mm_cmpord_sd(__m128d __a, __m128d __b)
    266 {
    267   return (__m128d)__builtin_ia32_cmpordsd(__a, __b);
    268 }
    269 
    270 static __inline__ __m128d __DEFAULT_FN_ATTRS
    271 _mm_cmpunord_sd(__m128d __a, __m128d __b)
    272 {
    273   return (__m128d)__builtin_ia32_cmpunordsd(__a, __b);
    274 }
    275 
    276 static __inline__ __m128d __DEFAULT_FN_ATTRS
    277 _mm_cmpneq_sd(__m128d __a, __m128d __b)
    278 {
    279   return (__m128d)__builtin_ia32_cmpneqsd(__a, __b);
    280 }
    281 
    282 static __inline__ __m128d __DEFAULT_FN_ATTRS
    283 _mm_cmpnlt_sd(__m128d __a, __m128d __b)
    284 {
    285   return (__m128d)__builtin_ia32_cmpnltsd(__a, __b);
    286 }
    287 
    288 static __inline__ __m128d __DEFAULT_FN_ATTRS
    289 _mm_cmpnle_sd(__m128d __a, __m128d __b)
    290 {
    291   return (__m128d)__builtin_ia32_cmpnlesd(__a, __b);
    292 }
    293 
    294 static __inline__ __m128d __DEFAULT_FN_ATTRS
    295 _mm_cmpngt_sd(__m128d __a, __m128d __b)
    296 {
    297   __m128d __c = __builtin_ia32_cmpnltsd(__b, __a);
    298   return (__m128d) { __c[0], __a[1] };
    299 }
    300 
    301 static __inline__ __m128d __DEFAULT_FN_ATTRS
    302 _mm_cmpnge_sd(__m128d __a, __m128d __b)
    303 {
    304   __m128d __c = __builtin_ia32_cmpnlesd(__b, __a);
    305   return (__m128d) { __c[0], __a[1] };
    306 }
    307 
    308 static __inline__ int __DEFAULT_FN_ATTRS
    309 _mm_comieq_sd(__m128d __a, __m128d __b)
    310 {
    311   return __builtin_ia32_comisdeq(__a, __b);
    312 }
    313 
    314 static __inline__ int __DEFAULT_FN_ATTRS
    315 _mm_comilt_sd(__m128d __a, __m128d __b)
    316 {
    317   return __builtin_ia32_comisdlt(__a, __b);
    318 }
    319 
    320 static __inline__ int __DEFAULT_FN_ATTRS
    321 _mm_comile_sd(__m128d __a, __m128d __b)
    322 {
    323   return __builtin_ia32_comisdle(__a, __b);
    324 }
    325 
    326 static __inline__ int __DEFAULT_FN_ATTRS
    327 _mm_comigt_sd(__m128d __a, __m128d __b)
    328 {
    329   return __builtin_ia32_comisdgt(__a, __b);
    330 }
    331 
    332 static __inline__ int __DEFAULT_FN_ATTRS
    333 _mm_comige_sd(__m128d __a, __m128d __b)
    334 {
    335   return __builtin_ia32_comisdge(__a, __b);
    336 }
    337 
    338 static __inline__ int __DEFAULT_FN_ATTRS
    339 _mm_comineq_sd(__m128d __a, __m128d __b)
    340 {
    341   return __builtin_ia32_comisdneq(__a, __b);
    342 }
    343 
    344 static __inline__ int __DEFAULT_FN_ATTRS
    345 _mm_ucomieq_sd(__m128d __a, __m128d __b)
    346 {
    347   return __builtin_ia32_ucomisdeq(__a, __b);
    348 }
    349 
    350 static __inline__ int __DEFAULT_FN_ATTRS
    351 _mm_ucomilt_sd(__m128d __a, __m128d __b)
    352 {
    353   return __builtin_ia32_ucomisdlt(__a, __b);
    354 }
    355 
    356 static __inline__ int __DEFAULT_FN_ATTRS
    357 _mm_ucomile_sd(__m128d __a, __m128d __b)
    358 {
    359   return __builtin_ia32_ucomisdle(__a, __b);
    360 }
    361 
    362 static __inline__ int __DEFAULT_FN_ATTRS
    363 _mm_ucomigt_sd(__m128d __a, __m128d __b)
    364 {
    365   return __builtin_ia32_ucomisdgt(__a, __b);
    366 }
    367 
    368 static __inline__ int __DEFAULT_FN_ATTRS
    369 _mm_ucomige_sd(__m128d __a, __m128d __b)
    370 {
    371   return __builtin_ia32_ucomisdge(__a, __b);
    372 }
    373 
    374 static __inline__ int __DEFAULT_FN_ATTRS
    375 _mm_ucomineq_sd(__m128d __a, __m128d __b)
    376 {
    377   return __builtin_ia32_ucomisdneq(__a, __b);
    378 }
    379 
    380 static __inline__ __m128 __DEFAULT_FN_ATTRS
    381 _mm_cvtpd_ps(__m128d __a)
    382 {
    383   return __builtin_ia32_cvtpd2ps(__a);
    384 }
    385 
    386 static __inline__ __m128d __DEFAULT_FN_ATTRS
    387 _mm_cvtps_pd(__m128 __a)
    388 {
    389   return __builtin_ia32_cvtps2pd(__a);
    390 }
    391 
    392 static __inline__ __m128d __DEFAULT_FN_ATTRS
    393 _mm_cvtepi32_pd(__m128i __a)
    394 {
    395   return __builtin_ia32_cvtdq2pd((__v4si)__a);
    396 }
    397 
    398 static __inline__ __m128i __DEFAULT_FN_ATTRS
    399 _mm_cvtpd_epi32(__m128d __a)
    400 {
    401   return __builtin_ia32_cvtpd2dq(__a);
    402 }
    403 
    404 static __inline__ int __DEFAULT_FN_ATTRS
    405 _mm_cvtsd_si32(__m128d __a)
    406 {
    407   return __builtin_ia32_cvtsd2si(__a);
    408 }
    409 
    410 static __inline__ __m128 __DEFAULT_FN_ATTRS
    411 _mm_cvtsd_ss(__m128 __a, __m128d __b)
    412 {
    413   __a[0] = __b[0];
    414   return __a;
    415 }
    416 
    417 static __inline__ __m128d __DEFAULT_FN_ATTRS
    418 _mm_cvtsi32_sd(__m128d __a, int __b)
    419 {
    420   __a[0] = __b;
    421   return __a;
    422 }
    423 
    424 static __inline__ __m128d __DEFAULT_FN_ATTRS
    425 _mm_cvtss_sd(__m128d __a, __m128 __b)
    426 {
    427   __a[0] = __b[0];
    428   return __a;
    429 }
    430 
    431 static __inline__ __m128i __DEFAULT_FN_ATTRS
    432 _mm_cvttpd_epi32(__m128d __a)
    433 {
    434   return (__m128i)__builtin_ia32_cvttpd2dq(__a);
    435 }
    436 
    437 static __inline__ int __DEFAULT_FN_ATTRS
    438 _mm_cvttsd_si32(__m128d __a)
    439 {
    440   return __a[0];
    441 }
    442 
    443 static __inline__ __m64 __DEFAULT_FN_ATTRS
    444 _mm_cvtpd_pi32(__m128d __a)
    445 {
    446   return (__m64)__builtin_ia32_cvtpd2pi(__a);
    447 }
    448 
    449 static __inline__ __m64 __DEFAULT_FN_ATTRS
    450 _mm_cvttpd_pi32(__m128d __a)
    451 {
    452   return (__m64)__builtin_ia32_cvttpd2pi(__a);
    453 }
    454 
    455 static __inline__ __m128d __DEFAULT_FN_ATTRS
    456 _mm_cvtpi32_pd(__m64 __a)
    457 {
    458   return __builtin_ia32_cvtpi2pd((__v2si)__a);
    459 }
    460 
    461 static __inline__ double __DEFAULT_FN_ATTRS
    462 _mm_cvtsd_f64(__m128d __a)
    463 {
    464   return __a[0];
    465 }
    466 
    467 static __inline__ __m128d __DEFAULT_FN_ATTRS
    468 _mm_load_pd(double const *__dp)
    469 {
    470   return *(__m128d*)__dp;
    471 }
    472 
    473 static __inline__ __m128d __DEFAULT_FN_ATTRS
    474 _mm_load1_pd(double const *__dp)
    475 {
    476   struct __mm_load1_pd_struct {
    477     double __u;
    478   } __attribute__((__packed__, __may_alias__));
    479   double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
    480   return (__m128d){ __u, __u };
    481 }
    482 
    483 #define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
    484 
    485 static __inline__ __m128d __DEFAULT_FN_ATTRS
    486 _mm_loadr_pd(double const *__dp)
    487 {
    488   __m128d __u = *(__m128d*)__dp;
    489   return __builtin_shufflevector(__u, __u, 1, 0);
    490 }
    491 
    492 static __inline__ __m128d __DEFAULT_FN_ATTRS
    493 _mm_loadu_pd(double const *__dp)
    494 {
    495   struct __loadu_pd {
    496     __m128d __v;
    497   } __attribute__((__packed__, __may_alias__));
    498   return ((struct __loadu_pd*)__dp)->__v;
    499 }
    500 
    501 static __inline__ __m128d __DEFAULT_FN_ATTRS
    502 _mm_load_sd(double const *__dp)
    503 {
    504   struct __mm_load_sd_struct {
    505     double __u;
    506   } __attribute__((__packed__, __may_alias__));
    507   double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
    508   return (__m128d){ __u, 0 };
    509 }
    510 
    511 static __inline__ __m128d __DEFAULT_FN_ATTRS
    512 _mm_loadh_pd(__m128d __a, double const *__dp)
    513 {
    514   struct __mm_loadh_pd_struct {
    515     double __u;
    516   } __attribute__((__packed__, __may_alias__));
    517   double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
    518   return (__m128d){ __a[0], __u };
    519 }
    520 
    521 static __inline__ __m128d __DEFAULT_FN_ATTRS
    522 _mm_loadl_pd(__m128d __a, double const *__dp)
    523 {
    524   struct __mm_loadl_pd_struct {
    525     double __u;
    526   } __attribute__((__packed__, __may_alias__));
    527   double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
    528   return (__m128d){ __u, __a[1] };
    529 }
    530 
    531 static __inline__ __m128d __DEFAULT_FN_ATTRS
    532 _mm_undefined_pd()
    533 {
    534   return (__m128d)__builtin_ia32_undef128();
    535 }
    536 
    537 static __inline__ __m128d __DEFAULT_FN_ATTRS
    538 _mm_set_sd(double __w)
    539 {
    540   return (__m128d){ __w, 0 };
    541 }
    542 
    543 static __inline__ __m128d __DEFAULT_FN_ATTRS
    544 _mm_set1_pd(double __w)
    545 {
    546   return (__m128d){ __w, __w };
    547 }
    548 
    549 static __inline__ __m128d __DEFAULT_FN_ATTRS
    550 _mm_set_pd(double __w, double __x)
    551 {
    552   return (__m128d){ __x, __w };
    553 }
    554 
    555 static __inline__ __m128d __DEFAULT_FN_ATTRS
    556 _mm_setr_pd(double __w, double __x)
    557 {
    558   return (__m128d){ __w, __x };
    559 }
    560 
    561 static __inline__ __m128d __DEFAULT_FN_ATTRS
    562 _mm_setzero_pd(void)
    563 {
    564   return (__m128d){ 0, 0 };
    565 }
    566 
    567 static __inline__ __m128d __DEFAULT_FN_ATTRS
    568 _mm_move_sd(__m128d __a, __m128d __b)
    569 {
    570   return (__m128d){ __b[0], __a[1] };
    571 }
    572 
    573 static __inline__ void __DEFAULT_FN_ATTRS
    574 _mm_store_sd(double *__dp, __m128d __a)
    575 {
    576   struct __mm_store_sd_struct {
    577     double __u;
    578   } __attribute__((__packed__, __may_alias__));
    579   ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
    580 }
    581 
    582 static __inline__ void __DEFAULT_FN_ATTRS
    583 _mm_store1_pd(double *__dp, __m128d __a)
    584 {
    585   struct __mm_store1_pd_struct {
    586     double __u[2];
    587   } __attribute__((__packed__, __may_alias__));
    588   ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
    589   ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
    590 }
    591 
    592 static __inline__ void __DEFAULT_FN_ATTRS
    593 _mm_store_pd(double *__dp, __m128d __a)
    594 {
    595   *(__m128d *)__dp = __a;
    596 }
    597 
    598 static __inline__ void __DEFAULT_FN_ATTRS
    599 _mm_storeu_pd(double *__dp, __m128d __a)
    600 {
    601   __builtin_ia32_storeupd(__dp, __a);
    602 }
    603 
    604 static __inline__ void __DEFAULT_FN_ATTRS
    605 _mm_storer_pd(double *__dp, __m128d __a)
    606 {
    607   __a = __builtin_shufflevector(__a, __a, 1, 0);
    608   *(__m128d *)__dp = __a;
    609 }
    610 
    611 static __inline__ void __DEFAULT_FN_ATTRS
    612 _mm_storeh_pd(double *__dp, __m128d __a)
    613 {
    614   struct __mm_storeh_pd_struct {
    615     double __u;
    616   } __attribute__((__packed__, __may_alias__));
    617   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
    618 }
    619 
    620 static __inline__ void __DEFAULT_FN_ATTRS
    621 _mm_storel_pd(double *__dp, __m128d __a)
    622 {
    623   struct __mm_storeh_pd_struct {
    624     double __u;
    625   } __attribute__((__packed__, __may_alias__));
    626   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
    627 }
    628 
    629 static __inline__ __m128i __DEFAULT_FN_ATTRS
    630 _mm_add_epi8(__m128i __a, __m128i __b)
    631 {
    632   return (__m128i)((__v16qi)__a + (__v16qi)__b);
    633 }
    634 
    635 static __inline__ __m128i __DEFAULT_FN_ATTRS
    636 _mm_add_epi16(__m128i __a, __m128i __b)
    637 {
    638   return (__m128i)((__v8hi)__a + (__v8hi)__b);
    639 }
    640 
    641 static __inline__ __m128i __DEFAULT_FN_ATTRS
    642 _mm_add_epi32(__m128i __a, __m128i __b)
    643 {
    644   return (__m128i)((__v4si)__a + (__v4si)__b);
    645 }
    646 
    647 static __inline__ __m64 __DEFAULT_FN_ATTRS
    648 _mm_add_si64(__m64 __a, __m64 __b)
    649 {
    650   return (__m64)__builtin_ia32_paddq(__a, __b);
    651 }
    652 
    653 static __inline__ __m128i __DEFAULT_FN_ATTRS
    654 _mm_add_epi64(__m128i __a, __m128i __b)
    655 {
    656   return __a + __b;
    657 }
    658 
    659 static __inline__ __m128i __DEFAULT_FN_ATTRS
    660 _mm_adds_epi8(__m128i __a, __m128i __b)
    661 {
    662   return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
    663 }
    664 
    665 static __inline__ __m128i __DEFAULT_FN_ATTRS
    666 _mm_adds_epi16(__m128i __a, __m128i __b)
    667 {
    668   return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
    669 }
    670 
    671 static __inline__ __m128i __DEFAULT_FN_ATTRS
    672 _mm_adds_epu8(__m128i __a, __m128i __b)
    673 {
    674   return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
    675 }
    676 
    677 static __inline__ __m128i __DEFAULT_FN_ATTRS
    678 _mm_adds_epu16(__m128i __a, __m128i __b)
    679 {
    680   return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
    681 }
    682 
    683 static __inline__ __m128i __DEFAULT_FN_ATTRS
    684 _mm_avg_epu8(__m128i __a, __m128i __b)
    685 {
    686   return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
    687 }
    688 
    689 static __inline__ __m128i __DEFAULT_FN_ATTRS
    690 _mm_avg_epu16(__m128i __a, __m128i __b)
    691 {
    692   return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
    693 }
    694 
    695 static __inline__ __m128i __DEFAULT_FN_ATTRS
    696 _mm_madd_epi16(__m128i __a, __m128i __b)
    697 {
    698   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
    699 }
    700 
    701 static __inline__ __m128i __DEFAULT_FN_ATTRS
    702 _mm_max_epi16(__m128i __a, __m128i __b)
    703 {
    704   return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
    705 }
    706 
    707 static __inline__ __m128i __DEFAULT_FN_ATTRS
    708 _mm_max_epu8(__m128i __a, __m128i __b)
    709 {
    710   return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
    711 }
    712 
    713 static __inline__ __m128i __DEFAULT_FN_ATTRS
    714 _mm_min_epi16(__m128i __a, __m128i __b)
    715 {
    716   return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
    717 }
    718 
    719 static __inline__ __m128i __DEFAULT_FN_ATTRS
    720 _mm_min_epu8(__m128i __a, __m128i __b)
    721 {
    722   return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
    723 }
    724 
    725 static __inline__ __m128i __DEFAULT_FN_ATTRS
    726 _mm_mulhi_epi16(__m128i __a, __m128i __b)
    727 {
    728   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
    729 }
    730 
    731 static __inline__ __m128i __DEFAULT_FN_ATTRS
    732 _mm_mulhi_epu16(__m128i __a, __m128i __b)
    733 {
    734   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
    735 }
    736 
    737 static __inline__ __m128i __DEFAULT_FN_ATTRS
    738 _mm_mullo_epi16(__m128i __a, __m128i __b)
    739 {
    740   return (__m128i)((__v8hi)__a * (__v8hi)__b);
    741 }
    742 
    743 static __inline__ __m64 __DEFAULT_FN_ATTRS
    744 _mm_mul_su32(__m64 __a, __m64 __b)
    745 {
    746   return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
    747 }
    748 
    749 static __inline__ __m128i __DEFAULT_FN_ATTRS
    750 _mm_mul_epu32(__m128i __a, __m128i __b)
    751 {
    752   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
    753 }
    754 
    755 static __inline__ __m128i __DEFAULT_FN_ATTRS
    756 _mm_sad_epu8(__m128i __a, __m128i __b)
    757 {
    758   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
    759 }
    760 
    761 static __inline__ __m128i __DEFAULT_FN_ATTRS
    762 _mm_sub_epi8(__m128i __a, __m128i __b)
    763 {
    764   return (__m128i)((__v16qi)__a - (__v16qi)__b);
    765 }
    766 
    767 static __inline__ __m128i __DEFAULT_FN_ATTRS
    768 _mm_sub_epi16(__m128i __a, __m128i __b)
    769 {
    770   return (__m128i)((__v8hi)__a - (__v8hi)__b);
    771 }
    772 
    773 static __inline__ __m128i __DEFAULT_FN_ATTRS
    774 _mm_sub_epi32(__m128i __a, __m128i __b)
    775 {
    776   return (__m128i)((__v4si)__a - (__v4si)__b);
    777 }
    778 
    779 static __inline__ __m64 __DEFAULT_FN_ATTRS
    780 _mm_sub_si64(__m64 __a, __m64 __b)
    781 {
    782   return (__m64)__builtin_ia32_psubq(__a, __b);
    783 }
    784 
    785 static __inline__ __m128i __DEFAULT_FN_ATTRS
    786 _mm_sub_epi64(__m128i __a, __m128i __b)
    787 {
    788   return __a - __b;
    789 }
    790 
    791 static __inline__ __m128i __DEFAULT_FN_ATTRS
    792 _mm_subs_epi8(__m128i __a, __m128i __b)
    793 {
    794   return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
    795 }
    796 
    797 static __inline__ __m128i __DEFAULT_FN_ATTRS
    798 _mm_subs_epi16(__m128i __a, __m128i __b)
    799 {
    800   return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
    801 }
    802 
    803 static __inline__ __m128i __DEFAULT_FN_ATTRS
    804 _mm_subs_epu8(__m128i __a, __m128i __b)
    805 {
    806   return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
    807 }
    808 
    809 static __inline__ __m128i __DEFAULT_FN_ATTRS
    810 _mm_subs_epu16(__m128i __a, __m128i __b)
    811 {
    812   return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
    813 }
    814 
    815 static __inline__ __m128i __DEFAULT_FN_ATTRS
    816 _mm_and_si128(__m128i __a, __m128i __b)
    817 {
    818   return __a & __b;
    819 }
    820 
    821 static __inline__ __m128i __DEFAULT_FN_ATTRS
    822 _mm_andnot_si128(__m128i __a, __m128i __b)
    823 {
    824   return ~__a & __b;
    825 }
    826 
    827 static __inline__ __m128i __DEFAULT_FN_ATTRS
    828 _mm_or_si128(__m128i __a, __m128i __b)
    829 {
    830   return __a | __b;
    831 }
    832 
    833 static __inline__ __m128i __DEFAULT_FN_ATTRS
    834 _mm_xor_si128(__m128i __a, __m128i __b)
    835 {
    836   return __a ^ __b;
    837 }
    838 
    839 #define _mm_slli_si128(a, imm) __extension__ ({                         \
    840   (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(),        \
    841                                    (__v16qi)(__m128i)(a),               \
    842                                    ((imm)&0xF0) ? 0 : 16 - ((imm)&0xF), \
    843                                    ((imm)&0xF0) ? 0 : 17 - ((imm)&0xF), \
    844                                    ((imm)&0xF0) ? 0 : 18 - ((imm)&0xF), \
    845                                    ((imm)&0xF0) ? 0 : 19 - ((imm)&0xF), \
    846                                    ((imm)&0xF0) ? 0 : 20 - ((imm)&0xF), \
    847                                    ((imm)&0xF0) ? 0 : 21 - ((imm)&0xF), \
    848                                    ((imm)&0xF0) ? 0 : 22 - ((imm)&0xF), \
    849                                    ((imm)&0xF0) ? 0 : 23 - ((imm)&0xF), \
    850                                    ((imm)&0xF0) ? 0 : 24 - ((imm)&0xF), \
    851                                    ((imm)&0xF0) ? 0 : 25 - ((imm)&0xF), \
    852                                    ((imm)&0xF0) ? 0 : 26 - ((imm)&0xF), \
    853                                    ((imm)&0xF0) ? 0 : 27 - ((imm)&0xF), \
    854                                    ((imm)&0xF0) ? 0 : 28 - ((imm)&0xF), \
    855                                    ((imm)&0xF0) ? 0 : 29 - ((imm)&0xF), \
    856                                    ((imm)&0xF0) ? 0 : 30 - ((imm)&0xF), \
    857                                    ((imm)&0xF0) ? 0 : 31 - ((imm)&0xF)); })
    858 
    859 #define _mm_bslli_si128(a, imm) \
    860   _mm_slli_si128((a), (imm))
    861 
    862 static __inline__ __m128i __DEFAULT_FN_ATTRS
    863 _mm_slli_epi16(__m128i __a, int __count)
    864 {
    865   return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
    866 }
    867 
    868 static __inline__ __m128i __DEFAULT_FN_ATTRS
    869 _mm_sll_epi16(__m128i __a, __m128i __count)
    870 {
    871   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
    872 }
    873 
    874 static __inline__ __m128i __DEFAULT_FN_ATTRS
    875 _mm_slli_epi32(__m128i __a, int __count)
    876 {
    877   return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
    878 }
    879 
    880 static __inline__ __m128i __DEFAULT_FN_ATTRS
    881 _mm_sll_epi32(__m128i __a, __m128i __count)
    882 {
    883   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
    884 }
    885 
    886 static __inline__ __m128i __DEFAULT_FN_ATTRS
    887 _mm_slli_epi64(__m128i __a, int __count)
    888 {
    889   return __builtin_ia32_psllqi128(__a, __count);
    890 }
    891 
    892 static __inline__ __m128i __DEFAULT_FN_ATTRS
    893 _mm_sll_epi64(__m128i __a, __m128i __count)
    894 {
    895   return __builtin_ia32_psllq128(__a, __count);
    896 }
    897 
    898 static __inline__ __m128i __DEFAULT_FN_ATTRS
    899 _mm_srai_epi16(__m128i __a, int __count)
    900 {
    901   return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
    902 }
    903 
    904 static __inline__ __m128i __DEFAULT_FN_ATTRS
    905 _mm_sra_epi16(__m128i __a, __m128i __count)
    906 {
    907   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
    908 }
    909 
    910 static __inline__ __m128i __DEFAULT_FN_ATTRS
    911 _mm_srai_epi32(__m128i __a, int __count)
    912 {
    913   return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
    914 }
    915 
    916 static __inline__ __m128i __DEFAULT_FN_ATTRS
    917 _mm_sra_epi32(__m128i __a, __m128i __count)
    918 {
    919   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
    920 }
    921 
    922 #define _mm_srli_si128(a, imm) __extension__ ({                          \
    923   (__m128i)__builtin_shufflevector((__v16qi)(__m128i)(a),                \
    924                                    (__v16qi)_mm_setzero_si128(),         \
    925                                    ((imm)&0xF0) ? 16 : ((imm)&0xF) + 0,  \
    926                                    ((imm)&0xF0) ? 16 : ((imm)&0xF) + 1,  \
    927                                    ((imm)&0xF0) ? 16 : ((imm)&0xF) + 2,  \
    928                                    ((imm)&0xF0) ? 16 : ((imm)&0xF) + 3,  \
    929                                    ((imm)&0xF0) ? 16 : ((imm)&0xF) + 4,  \
    930                                    ((imm)&0xF0) ? 16 : ((imm)&0xF) + 5,  \
    931                                    ((imm)&0xF0) ? 16 : ((imm)&0xF) + 6,  \
    932                                    ((imm)&0xF0) ? 16 : ((imm)&0xF) + 7,  \
    933                                    ((imm)&0xF0) ? 16 : ((imm)&0xF) + 8,  \
    934                                    ((imm)&0xF0) ? 16 : ((imm)&0xF) + 9,  \
    935                                    ((imm)&0xF0) ? 16 : ((imm)&0xF) + 10, \
    936                                    ((imm)&0xF0) ? 16 : ((imm)&0xF) + 11, \
    937                                    ((imm)&0xF0) ? 16 : ((imm)&0xF) + 12, \
    938                                    ((imm)&0xF0) ? 16 : ((imm)&0xF) + 13, \
    939                                    ((imm)&0xF0) ? 16 : ((imm)&0xF) + 14, \
    940                                    ((imm)&0xF0) ? 16 : ((imm)&0xF) + 15); })
    941 
    942 #define _mm_bsrli_si128(a, imm) \
    943   _mm_srli_si128((a), (imm))
    944 
    945 static __inline__ __m128i __DEFAULT_FN_ATTRS
    946 _mm_srli_epi16(__m128i __a, int __count)
    947 {
    948   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
    949 }
    950 
    951 static __inline__ __m128i __DEFAULT_FN_ATTRS
    952 _mm_srl_epi16(__m128i __a, __m128i __count)
    953 {
    954   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
    955 }
    956 
    957 static __inline__ __m128i __DEFAULT_FN_ATTRS
    958 _mm_srli_epi32(__m128i __a, int __count)
    959 {
    960   return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
    961 }
    962 
    963 static __inline__ __m128i __DEFAULT_FN_ATTRS
    964 _mm_srl_epi32(__m128i __a, __m128i __count)
    965 {
    966   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
    967 }
    968 
    969 static __inline__ __m128i __DEFAULT_FN_ATTRS
    970 _mm_srli_epi64(__m128i __a, int __count)
    971 {
    972   return __builtin_ia32_psrlqi128(__a, __count);
    973 }
    974 
    975 static __inline__ __m128i __DEFAULT_FN_ATTRS
    976 _mm_srl_epi64(__m128i __a, __m128i __count)
    977 {
    978   return __builtin_ia32_psrlq128(__a, __count);
    979 }
    980 
    981 static __inline__ __m128i __DEFAULT_FN_ATTRS
    982 _mm_cmpeq_epi8(__m128i __a, __m128i __b)
    983 {
    984   return (__m128i)((__v16qi)__a == (__v16qi)__b);
    985 }
    986 
    987 static __inline__ __m128i __DEFAULT_FN_ATTRS
    988 _mm_cmpeq_epi16(__m128i __a, __m128i __b)
    989 {
    990   return (__m128i)((__v8hi)__a == (__v8hi)__b);
    991 }
    992 
    993 static __inline__ __m128i __DEFAULT_FN_ATTRS
    994 _mm_cmpeq_epi32(__m128i __a, __m128i __b)
    995 {
    996   return (__m128i)((__v4si)__a == (__v4si)__b);
    997 }
    998 
    999 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1000 _mm_cmpgt_epi8(__m128i __a, __m128i __b)
   1001 {
   1002   /* This function always performs a signed comparison, but __v16qi is a char
   1003      which may be signed or unsigned, so use __v16qs. */
   1004   return (__m128i)((__v16qs)__a > (__v16qs)__b);
   1005 }
   1006 
   1007 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1008 _mm_cmpgt_epi16(__m128i __a, __m128i __b)
   1009 {
   1010   return (__m128i)((__v8hi)__a > (__v8hi)__b);
   1011 }
   1012 
   1013 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1014 _mm_cmpgt_epi32(__m128i __a, __m128i __b)
   1015 {
   1016   return (__m128i)((__v4si)__a > (__v4si)__b);
   1017 }
   1018 
   1019 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1020 _mm_cmplt_epi8(__m128i __a, __m128i __b)
   1021 {
   1022   return _mm_cmpgt_epi8(__b, __a);
   1023 }
   1024 
   1025 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1026 _mm_cmplt_epi16(__m128i __a, __m128i __b)
   1027 {
   1028   return _mm_cmpgt_epi16(__b, __a);
   1029 }
   1030 
   1031 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1032 _mm_cmplt_epi32(__m128i __a, __m128i __b)
   1033 {
   1034   return _mm_cmpgt_epi32(__b, __a);
   1035 }
   1036 
   1037 #ifdef __x86_64__
   1038 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1039 _mm_cvtsi64_sd(__m128d __a, long long __b)
   1040 {
   1041   __a[0] = __b;
   1042   return __a;
   1043 }
   1044 
   1045 static __inline__ long long __DEFAULT_FN_ATTRS
   1046 _mm_cvtsd_si64(__m128d __a)
   1047 {
   1048   return __builtin_ia32_cvtsd2si64(__a);
   1049 }
   1050 
   1051 static __inline__ long long __DEFAULT_FN_ATTRS
   1052 _mm_cvttsd_si64(__m128d __a)
   1053 {
   1054   return __a[0];
   1055 }
   1056 #endif
   1057 
   1058 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1059 _mm_cvtepi32_ps(__m128i __a)
   1060 {
   1061   return __builtin_ia32_cvtdq2ps((__v4si)__a);
   1062 }
   1063 
   1064 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1065 _mm_cvtps_epi32(__m128 __a)
   1066 {
   1067   return (__m128i)__builtin_ia32_cvtps2dq(__a);
   1068 }
   1069 
   1070 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1071 _mm_cvttps_epi32(__m128 __a)
   1072 {
   1073   return (__m128i)__builtin_ia32_cvttps2dq(__a);
   1074 }
   1075 
   1076 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1077 _mm_cvtsi32_si128(int __a)
   1078 {
   1079   return (__m128i)(__v4si){ __a, 0, 0, 0 };
   1080 }
   1081 
   1082 #ifdef __x86_64__
   1083 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1084 _mm_cvtsi64_si128(long long __a)
   1085 {
   1086   return (__m128i){ __a, 0 };
   1087 }
   1088 #endif
   1089 
   1090 static __inline__ int __DEFAULT_FN_ATTRS
   1091 _mm_cvtsi128_si32(__m128i __a)
   1092 {
   1093   __v4si __b = (__v4si)__a;
   1094   return __b[0];
   1095 }
   1096 
   1097 #ifdef __x86_64__
   1098 static __inline__ long long __DEFAULT_FN_ATTRS
   1099 _mm_cvtsi128_si64(__m128i __a)
   1100 {
   1101   return __a[0];
   1102 }
   1103 #endif
   1104 
   1105 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1106 _mm_load_si128(__m128i const *__p)
   1107 {
   1108   return *__p;
   1109 }
   1110 
   1111 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1112 _mm_loadu_si128(__m128i const *__p)
   1113 {
   1114   struct __loadu_si128 {
   1115     __m128i __v;
   1116   } __attribute__((__packed__, __may_alias__));
   1117   return ((struct __loadu_si128*)__p)->__v;
   1118 }
   1119 
   1120 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1121 _mm_loadl_epi64(__m128i const *__p)
   1122 {
   1123   struct __mm_loadl_epi64_struct {
   1124     long long __u;
   1125   } __attribute__((__packed__, __may_alias__));
   1126   return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
   1127 }
   1128 
   1129 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1130 _mm_undefined_si128()
   1131 {
   1132   return (__m128i)__builtin_ia32_undef128();
   1133 }
   1134 
   1135 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1136 _mm_set_epi64x(long long __q1, long long __q0)
   1137 {
   1138   return (__m128i){ __q0, __q1 };
   1139 }
   1140 
   1141 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1142 _mm_set_epi64(__m64 __q1, __m64 __q0)
   1143 {
   1144   return (__m128i){ (long long)__q0, (long long)__q1 };
   1145 }
   1146 
   1147 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1148 _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
   1149 {
   1150   return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
   1151 }
   1152 
   1153 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1154 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
   1155 {
   1156   return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
   1157 }
   1158 
   1159 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1160 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
   1161 {
   1162   return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
   1163 }
   1164 
   1165 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1166 _mm_set1_epi64x(long long __q)
   1167 {
   1168   return (__m128i){ __q, __q };
   1169 }
   1170 
   1171 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1172 _mm_set1_epi64(__m64 __q)
   1173 {
   1174   return (__m128i){ (long long)__q, (long long)__q };
   1175 }
   1176 
   1177 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1178 _mm_set1_epi32(int __i)
   1179 {
   1180   return (__m128i)(__v4si){ __i, __i, __i, __i };
   1181 }
   1182 
   1183 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1184 _mm_set1_epi16(short __w)
   1185 {
   1186   return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
   1187 }
   1188 
   1189 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1190 _mm_set1_epi8(char __b)
   1191 {
   1192   return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
   1193 }
   1194 
   1195 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1196 _mm_setr_epi64(__m64 __q0, __m64 __q1)
   1197 {
   1198   return (__m128i){ (long long)__q0, (long long)__q1 };
   1199 }
   1200 
   1201 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1202 _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
   1203 {
   1204   return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
   1205 }
   1206 
   1207 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1208 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
   1209 {
   1210   return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
   1211 }
   1212 
   1213 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1214 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
   1215 {
   1216   return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
   1217 }
   1218 
   1219 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1220 _mm_setzero_si128(void)
   1221 {
   1222   return (__m128i){ 0LL, 0LL };
   1223 }
   1224 
   1225 static __inline__ void __DEFAULT_FN_ATTRS
   1226 _mm_store_si128(__m128i *__p, __m128i __b)
   1227 {
   1228   *__p = __b;
   1229 }
   1230 
   1231 static __inline__ void __DEFAULT_FN_ATTRS
   1232 _mm_storeu_si128(__m128i *__p, __m128i __b)
   1233 {
   1234   __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
   1235 }
   1236 
   1237 static __inline__ void __DEFAULT_FN_ATTRS
   1238 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
   1239 {
   1240   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
   1241 }
   1242 
   1243 static __inline__ void __DEFAULT_FN_ATTRS
   1244 _mm_storel_epi64(__m128i *__p, __m128i __a)
   1245 {
   1246   struct __mm_storel_epi64_struct {
   1247     long long __u;
   1248   } __attribute__((__packed__, __may_alias__));
   1249   ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
   1250 }
   1251 
   1252 static __inline__ void __DEFAULT_FN_ATTRS
   1253 _mm_stream_pd(double *__p, __m128d __a)
   1254 {
   1255   __builtin_ia32_movntpd(__p, __a);
   1256 }
   1257 
   1258 static __inline__ void __DEFAULT_FN_ATTRS
   1259 _mm_stream_si128(__m128i *__p, __m128i __a)
   1260 {
   1261   __builtin_ia32_movntdq(__p, __a);
   1262 }
   1263 
   1264 static __inline__ void __DEFAULT_FN_ATTRS
   1265 _mm_stream_si32(int *__p, int __a)
   1266 {
   1267   __builtin_ia32_movnti(__p, __a);
   1268 }
   1269 
   1270 #ifdef __x86_64__
   1271 static __inline__ void __DEFAULT_FN_ATTRS
   1272 _mm_stream_si64(long long *__p, long long __a)
   1273 {
   1274   __builtin_ia32_movnti64(__p, __a);
   1275 }
   1276 #endif
   1277 
   1278 static __inline__ void __DEFAULT_FN_ATTRS
   1279 _mm_clflush(void const *__p)
   1280 {
   1281   __builtin_ia32_clflush(__p);
   1282 }
   1283 
   1284 static __inline__ void __DEFAULT_FN_ATTRS
   1285 _mm_lfence(void)
   1286 {
   1287   __builtin_ia32_lfence();
   1288 }
   1289 
   1290 static __inline__ void __DEFAULT_FN_ATTRS
   1291 _mm_mfence(void)
   1292 {
   1293   __builtin_ia32_mfence();
   1294 }
   1295 
   1296 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1297 _mm_packs_epi16(__m128i __a, __m128i __b)
   1298 {
   1299   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
   1300 }
   1301 
   1302 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1303 _mm_packs_epi32(__m128i __a, __m128i __b)
   1304 {
   1305   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
   1306 }
   1307 
   1308 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1309 _mm_packus_epi16(__m128i __a, __m128i __b)
   1310 {
   1311   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
   1312 }
   1313 
   1314 static __inline__ int __DEFAULT_FN_ATTRS
   1315 _mm_extract_epi16(__m128i __a, int __imm)
   1316 {
   1317   __v8hi __b = (__v8hi)__a;
   1318   return (unsigned short)__b[__imm & 7];
   1319 }
   1320 
   1321 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1322 _mm_insert_epi16(__m128i __a, int __b, int __imm)
   1323 {
   1324   __v8hi __c = (__v8hi)__a;
   1325   __c[__imm & 7] = __b;
   1326   return (__m128i)__c;
   1327 }
   1328 
   1329 static __inline__ int __DEFAULT_FN_ATTRS
   1330 _mm_movemask_epi8(__m128i __a)
   1331 {
   1332   return __builtin_ia32_pmovmskb128((__v16qi)__a);
   1333 }
   1334 
   1335 #define _mm_shuffle_epi32(a, imm) __extension__ ({ \
   1336   (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
   1337                                    (__v4si)_mm_setzero_si128(), \
   1338                                    (imm) & 0x3, ((imm) & 0xc) >> 2, \
   1339                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
   1340 
   1341 #define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
   1342   (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
   1343                                    (__v8hi)_mm_setzero_si128(), \
   1344                                    (imm) & 0x3, ((imm) & 0xc) >> 2, \
   1345                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
   1346                                    4, 5, 6, 7); })
   1347 
   1348 #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
   1349   (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
   1350                                    (__v8hi)_mm_setzero_si128(), \
   1351                                    0, 1, 2, 3, \
   1352                                    4 + (((imm) & 0x03) >> 0), \
   1353                                    4 + (((imm) & 0x0c) >> 2), \
   1354                                    4 + (((imm) & 0x30) >> 4), \
   1355                                    4 + (((imm) & 0xc0) >> 6)); })
   1356 
   1357 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1358 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
   1359 {
   1360   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
   1361 }
   1362 
   1363 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1364 _mm_unpackhi_epi16(__m128i __a, __m128i __b)
   1365 {
   1366   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
   1367 }
   1368 
   1369 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1370 _mm_unpackhi_epi32(__m128i __a, __m128i __b)
   1371 {
   1372   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
   1373 }
   1374 
   1375 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1376 _mm_unpackhi_epi64(__m128i __a, __m128i __b)
   1377 {
   1378   return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
   1379 }
   1380 
   1381 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1382 _mm_unpacklo_epi8(__m128i __a, __m128i __b)
   1383 {
   1384   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
   1385 }
   1386 
   1387 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1388 _mm_unpacklo_epi16(__m128i __a, __m128i __b)
   1389 {
   1390   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
   1391 }
   1392 
   1393 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1394 _mm_unpacklo_epi32(__m128i __a, __m128i __b)
   1395 {
   1396   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
   1397 }
   1398 
   1399 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1400 _mm_unpacklo_epi64(__m128i __a, __m128i __b)
   1401 {
   1402   return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
   1403 }
   1404 
   1405 static __inline__ __m64 __DEFAULT_FN_ATTRS
   1406 _mm_movepi64_pi64(__m128i __a)
   1407 {
   1408   return (__m64)__a[0];
   1409 }
   1410 
   1411 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1412 _mm_movpi64_epi64(__m64 __a)
   1413 {
   1414   return (__m128i){ (long long)__a, 0 };
   1415 }
   1416 
   1417 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1418 _mm_move_epi64(__m128i __a)
   1419 {
   1420   return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
   1421 }
   1422 
   1423 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1424 _mm_unpackhi_pd(__m128d __a, __m128d __b)
   1425 {
   1426   return __builtin_shufflevector(__a, __b, 1, 2+1);
   1427 }
   1428 
   1429 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1430 _mm_unpacklo_pd(__m128d __a, __m128d __b)
   1431 {
   1432   return __builtin_shufflevector(__a, __b, 0, 2+0);
   1433 }
   1434 
   1435 static __inline__ int __DEFAULT_FN_ATTRS
   1436 _mm_movemask_pd(__m128d __a)
   1437 {
   1438   return __builtin_ia32_movmskpd(__a);
   1439 }
   1440 
   1441 #define _mm_shuffle_pd(a, b, i) __extension__ ({ \
   1442   (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
   1443                                    (i) & 1, (((i) & 2) >> 1) + 2); })
   1444 
   1445 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1446 _mm_castpd_ps(__m128d __a)
   1447 {
   1448   return (__m128)__a;
   1449 }
   1450 
   1451 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1452 _mm_castpd_si128(__m128d __a)
   1453 {
   1454   return (__m128i)__a;
   1455 }
   1456 
   1457 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1458 _mm_castps_pd(__m128 __a)
   1459 {
   1460   return (__m128d)__a;
   1461 }
   1462 
   1463 static __inline__ __m128i __DEFAULT_FN_ATTRS
   1464 _mm_castps_si128(__m128 __a)
   1465 {
   1466   return (__m128i)__a;
   1467 }
   1468 
   1469 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1470 _mm_castsi128_ps(__m128i __a)
   1471 {
   1472   return (__m128)__a;
   1473 }
   1474 
   1475 static __inline__ __m128d __DEFAULT_FN_ATTRS
   1476 _mm_castsi128_pd(__m128i __a)
   1477 {
   1478   return (__m128d)__a;
   1479 }
   1480 
   1481 static __inline__ void __DEFAULT_FN_ATTRS
   1482 _mm_pause(void)
   1483 {
   1484   __builtin_ia32_pause();
   1485 }
   1486 
   1487 #undef __DEFAULT_FN_ATTRS
   1488 
   1489 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
   1490 
   1491 #endif /* __EMMINTRIN_H */
   1492