Home | History | Annotate | Download | only in clang-include
      1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
      2  *
      3  * Permission is hereby granted, free of charge, to any person obtaining a copy
      4  * of this software and associated documentation files (the "Software"), to deal
      5  * in the Software without restriction, including without limitation the rights
      6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      7  * copies of the Software, and to permit persons to whom the Software is
      8  * furnished to do so, subject to the following conditions:
      9  *
     10  * The above copyright notice and this permission notice shall be included in
     11  * all copies or substantial portions of the Software.
     12  *
     13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     19  * THE SOFTWARE.
     20  *
     21  *===-----------------------------------------------------------------------===
     22  */
     23 
     24 #ifndef __EMMINTRIN_H
     25 #define __EMMINTRIN_H
     26 
     27 #ifndef __SSE2__
     28 #error "SSE2 instruction set not enabled"
     29 #else
     30 
     31 #include <xmmintrin.h>
     32 
     /* Public 128-bit SSE2 vector types. Both are declared with a vector size of
        16 bytes: __m128d has element type double, __m128i has element type
        long long. */
     33 typedef double __m128d __attribute__((__vector_size__(16)));
     34 typedef long long __m128i __attribute__((__vector_size__(16)));
     35 
     36 /* Type defines.  */
     /* Element-typed 16-byte vector views (double, long long, short, char).
        The intrinsics below cast __m128i/__m128d values to these types to pick
        the lane width an operation or builtin works on. */
     37 typedef double __v2df __attribute__ ((__vector_size__ (16)));
     38 typedef long long __v2di __attribute__ ((__vector_size__ (16)));
     39 typedef short __v8hi __attribute__((__vector_size__(16)));
     40 typedef char __v16qi __attribute__((__vector_size__(16)));
     41 
     42 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
     43 _mm_add_sd(__m128d __a, __m128d __b)
     44 {
     45   __a[0] += __b[0];
     46   return __a;
     47 }
     48 
     49 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
     50 _mm_add_pd(__m128d __a, __m128d __b)
     51 {
     52   return __a + __b;
     53 }
     54 
     55 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
     56 _mm_sub_sd(__m128d __a, __m128d __b)
     57 {
     58   __a[0] -= __b[0];
     59   return __a;
     60 }
     61 
     62 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
     63 _mm_sub_pd(__m128d __a, __m128d __b)
     64 {
     65   return __a - __b;
     66 }
     67 
     68 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
     69 _mm_mul_sd(__m128d __a, __m128d __b)
     70 {
     71   __a[0] *= __b[0];
     72   return __a;
     73 }
     74 
     75 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
     76 _mm_mul_pd(__m128d __a, __m128d __b)
     77 {
     78   return __a * __b;
     79 }
     80 
     81 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
     82 _mm_div_sd(__m128d __a, __m128d __b)
     83 {
     84   __a[0] /= __b[0];
     85   return __a;
     86 }
     87 
     88 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
     89 _mm_div_pd(__m128d __a, __m128d __b)
     90 {
     91   return __a / __b;
     92 }
     93 
     94 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
     95 _mm_sqrt_sd(__m128d __a, __m128d __b)
     96 {
     97   __m128d __c = __builtin_ia32_sqrtsd(__b);
     98   return (__m128d) { __c[0], __a[1] };
     99 }
    100 
    101 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    102 _mm_sqrt_pd(__m128d __a)
    103 {
    104   return __builtin_ia32_sqrtpd(__a);
    105 }
    106 
    107 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    108 _mm_min_sd(__m128d __a, __m128d __b)
    109 {
    110   return __builtin_ia32_minsd(__a, __b);
    111 }
    112 
    113 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    114 _mm_min_pd(__m128d __a, __m128d __b)
    115 {
    116   return __builtin_ia32_minpd(__a, __b);
    117 }
    118 
    119 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    120 _mm_max_sd(__m128d __a, __m128d __b)
    121 {
    122   return __builtin_ia32_maxsd(__a, __b);
    123 }
    124 
    125 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    126 _mm_max_pd(__m128d __a, __m128d __b)
    127 {
    128   return __builtin_ia32_maxpd(__a, __b);
    129 }
    130 
    131 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    132 _mm_and_pd(__m128d __a, __m128d __b)
    133 {
    134   return (__m128d)((__v4si)__a & (__v4si)__b);
    135 }
    136 
    137 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    138 _mm_andnot_pd(__m128d __a, __m128d __b)
    139 {
    140   return (__m128d)(~(__v4si)__a & (__v4si)__b);
    141 }
    142 
    143 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    144 _mm_or_pd(__m128d __a, __m128d __b)
    145 {
    146   return (__m128d)((__v4si)__a | (__v4si)__b);
    147 }
    148 
    149 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    150 _mm_xor_pd(__m128d __a, __m128d __b)
    151 {
    152   return (__m128d)((__v4si)__a ^ (__v4si)__b);
    153 }
    154 
    155 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    156 _mm_cmpeq_pd(__m128d __a, __m128d __b)
    157 {
    158   return (__m128d)__builtin_ia32_cmpeqpd(__a, __b);
    159 }
    160 
    161 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    162 _mm_cmplt_pd(__m128d __a, __m128d __b)
    163 {
    164   return (__m128d)__builtin_ia32_cmpltpd(__a, __b);
    165 }
    166 
    167 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    168 _mm_cmple_pd(__m128d __a, __m128d __b)
    169 {
    170   return (__m128d)__builtin_ia32_cmplepd(__a, __b);
    171 }
    172 
    173 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    174 _mm_cmpgt_pd(__m128d __a, __m128d __b)
    175 {
    176   return (__m128d)__builtin_ia32_cmpltpd(__b, __a);
    177 }
    178 
    179 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    180 _mm_cmpge_pd(__m128d __a, __m128d __b)
    181 {
    182   return (__m128d)__builtin_ia32_cmplepd(__b, __a);
    183 }
    184 
    185 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    186 _mm_cmpord_pd(__m128d __a, __m128d __b)
    187 {
    188   return (__m128d)__builtin_ia32_cmpordpd(__a, __b);
    189 }
    190 
    191 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    192 _mm_cmpunord_pd(__m128d __a, __m128d __b)
    193 {
    194   return (__m128d)__builtin_ia32_cmpunordpd(__a, __b);
    195 }
    196 
    197 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    198 _mm_cmpneq_pd(__m128d __a, __m128d __b)
    199 {
    200   return (__m128d)__builtin_ia32_cmpneqpd(__a, __b);
    201 }
    202 
    203 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    204 _mm_cmpnlt_pd(__m128d __a, __m128d __b)
    205 {
    206   return (__m128d)__builtin_ia32_cmpnltpd(__a, __b);
    207 }
    208 
    209 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    210 _mm_cmpnle_pd(__m128d __a, __m128d __b)
    211 {
    212   return (__m128d)__builtin_ia32_cmpnlepd(__a, __b);
    213 }
    214 
    215 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    216 _mm_cmpngt_pd(__m128d __a, __m128d __b)
    217 {
    218   return (__m128d)__builtin_ia32_cmpnltpd(__b, __a);
    219 }
    220 
    221 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    222 _mm_cmpnge_pd(__m128d __a, __m128d __b)
    223 {
    224   return (__m128d)__builtin_ia32_cmpnlepd(__b, __a);
    225 }
    226 
    227 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    228 _mm_cmpeq_sd(__m128d __a, __m128d __b)
    229 {
    230   return (__m128d)__builtin_ia32_cmpeqsd(__a, __b);
    231 }
    232 
    233 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    234 _mm_cmplt_sd(__m128d __a, __m128d __b)
    235 {
    236   return (__m128d)__builtin_ia32_cmpltsd(__a, __b);
    237 }
    238 
    239 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    240 _mm_cmple_sd(__m128d __a, __m128d __b)
    241 {
    242   return (__m128d)__builtin_ia32_cmplesd(__a, __b);
    243 }
    244 
    245 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    246 _mm_cmpgt_sd(__m128d __a, __m128d __b)
    247 {
    248   __m128d __c = __builtin_ia32_cmpltsd(__b, __a);
    249   return (__m128d) { __c[0], __a[1] };
    250 }
    251 
    252 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    253 _mm_cmpge_sd(__m128d __a, __m128d __b)
    254 {
    255   __m128d __c = __builtin_ia32_cmplesd(__b, __a);
    256   return (__m128d) { __c[0], __a[1] };
    257 }
    258 
    259 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    260 _mm_cmpord_sd(__m128d __a, __m128d __b)
    261 {
    262   return (__m128d)__builtin_ia32_cmpordsd(__a, __b);
    263 }
    264 
    265 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    266 _mm_cmpunord_sd(__m128d __a, __m128d __b)
    267 {
    268   return (__m128d)__builtin_ia32_cmpunordsd(__a, __b);
    269 }
    270 
    271 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    272 _mm_cmpneq_sd(__m128d __a, __m128d __b)
    273 {
    274   return (__m128d)__builtin_ia32_cmpneqsd(__a, __b);
    275 }
    276 
    277 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    278 _mm_cmpnlt_sd(__m128d __a, __m128d __b)
    279 {
    280   return (__m128d)__builtin_ia32_cmpnltsd(__a, __b);
    281 }
    282 
    283 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    284 _mm_cmpnle_sd(__m128d __a, __m128d __b)
    285 {
    286   return (__m128d)__builtin_ia32_cmpnlesd(__a, __b);
    287 }
    288 
    289 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    290 _mm_cmpngt_sd(__m128d __a, __m128d __b)
    291 {
    292   __m128d __c = __builtin_ia32_cmpnltsd(__b, __a);
    293   return (__m128d) { __c[0], __a[1] };
    294 }
    295 
    296 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    297 _mm_cmpnge_sd(__m128d __a, __m128d __b)
    298 {
    299   __m128d __c = __builtin_ia32_cmpnlesd(__b, __a);
    300   return (__m128d) { __c[0], __a[1] };
    301 }
    302 
    303 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    304 _mm_comieq_sd(__m128d __a, __m128d __b)
    305 {
    306   return __builtin_ia32_comisdeq(__a, __b);
    307 }
    308 
    309 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    310 _mm_comilt_sd(__m128d __a, __m128d __b)
    311 {
    312   return __builtin_ia32_comisdlt(__a, __b);
    313 }
    314 
    315 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    316 _mm_comile_sd(__m128d __a, __m128d __b)
    317 {
    318   return __builtin_ia32_comisdle(__a, __b);
    319 }
    320 
    321 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    322 _mm_comigt_sd(__m128d __a, __m128d __b)
    323 {
    324   return __builtin_ia32_comisdgt(__a, __b);
    325 }
    326 
    327 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    328 _mm_comige_sd(__m128d __a, __m128d __b)
    329 {
    330   return __builtin_ia32_comisdge(__a, __b);
    331 }
    332 
    333 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    334 _mm_comineq_sd(__m128d __a, __m128d __b)
    335 {
    336   return __builtin_ia32_comisdneq(__a, __b);
    337 }
    338 
    339 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    340 _mm_ucomieq_sd(__m128d __a, __m128d __b)
    341 {
    342   return __builtin_ia32_ucomisdeq(__a, __b);
    343 }
    344 
    345 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    346 _mm_ucomilt_sd(__m128d __a, __m128d __b)
    347 {
    348   return __builtin_ia32_ucomisdlt(__a, __b);
    349 }
    350 
    351 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    352 _mm_ucomile_sd(__m128d __a, __m128d __b)
    353 {
    354   return __builtin_ia32_ucomisdle(__a, __b);
    355 }
    356 
    357 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    358 _mm_ucomigt_sd(__m128d __a, __m128d __b)
    359 {
    360   return __builtin_ia32_ucomisdgt(__a, __b);
    361 }
    362 
    363 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    364 _mm_ucomige_sd(__m128d __a, __m128d __b)
    365 {
    366   return __builtin_ia32_ucomisdge(__a, __b);
    367 }
    368 
    369 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    370 _mm_ucomineq_sd(__m128d __a, __m128d __b)
    371 {
    372   return __builtin_ia32_ucomisdneq(__a, __b);
    373 }
    374 
    375 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
    376 _mm_cvtpd_ps(__m128d __a)
    377 {
    378   return __builtin_ia32_cvtpd2ps(__a);
    379 }
    380 
    381 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    382 _mm_cvtps_pd(__m128 __a)
    383 {
    384   return __builtin_ia32_cvtps2pd(__a);
    385 }
    386 
    387 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    388 _mm_cvtepi32_pd(__m128i __a)
    389 {
    390   return __builtin_ia32_cvtdq2pd((__v4si)__a);
    391 }
    392 
    393 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    394 _mm_cvtpd_epi32(__m128d __a)
    395 {
    396   return __builtin_ia32_cvtpd2dq(__a);
    397 }
    398 
    399 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    400 _mm_cvtsd_si32(__m128d __a)
    401 {
    402   return __builtin_ia32_cvtsd2si(__a);
    403 }
    404 
    405 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
    406 _mm_cvtsd_ss(__m128 __a, __m128d __b)
    407 {
    408   __a[0] = __b[0];
    409   return __a;
    410 }
    411 
    412 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    413 _mm_cvtsi32_sd(__m128d __a, int __b)
    414 {
    415   __a[0] = __b;
    416   return __a;
    417 }
    418 
    419 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    420 _mm_cvtss_sd(__m128d __a, __m128 __b)
    421 {
    422   __a[0] = __b[0];
    423   return __a;
    424 }
    425 
    426 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    427 _mm_cvttpd_epi32(__m128d __a)
    428 {
    429   return (__m128i)__builtin_ia32_cvttpd2dq(__a);
    430 }
    431 
    432 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    433 _mm_cvttsd_si32(__m128d __a)
    434 {
    435   return __a[0];
    436 }
    437 
    438 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
    439 _mm_cvtpd_pi32(__m128d __a)
    440 {
    441   return (__m64)__builtin_ia32_cvtpd2pi(__a);
    442 }
    443 
    444 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
    445 _mm_cvttpd_pi32(__m128d __a)
    446 {
    447   return (__m64)__builtin_ia32_cvttpd2pi(__a);
    448 }
    449 
    450 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    451 _mm_cvtpi32_pd(__m64 __a)
    452 {
    453   return __builtin_ia32_cvtpi2pd((__v2si)__a);
    454 }
    455 
    456 static __inline__ double __attribute__((__always_inline__, __nodebug__))
    457 _mm_cvtsd_f64(__m128d __a)
    458 {
    459   return __a[0];
    460 }
    461 
    462 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    463 _mm_load_pd(double const *__dp)
    464 {
    465   return *(__m128d*)__dp;
    466 }
    467 
    468 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    469 _mm_load1_pd(double const *__dp)
    470 {
    471   struct __mm_load1_pd_struct {
    472     double __u;
    473   } __attribute__((__packed__, __may_alias__));
    474   double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
    475   return (__m128d){ __u, __u };
    476 }
    477 
    /* Alternate spelling: _mm_load_pd1(dp) expands to _mm_load1_pd(dp). */
    478 #define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
    479 
    480 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    481 _mm_loadr_pd(double const *__dp)
    482 {
    483   __m128d __u = *(__m128d*)__dp;
    484   return __builtin_shufflevector(__u, __u, 1, 0);
    485 }
    486 
    487 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    488 _mm_loadu_pd(double const *__dp)
    489 {
    490   struct __loadu_pd {
    491     __m128d __v;
    492   } __attribute__((__packed__, __may_alias__));
    493   return ((struct __loadu_pd*)__dp)->__v;
    494 }
    495 
    496 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    497 _mm_load_sd(double const *__dp)
    498 {
    499   struct __mm_load_sd_struct {
    500     double __u;
    501   } __attribute__((__packed__, __may_alias__));
    502   double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
    503   return (__m128d){ __u, 0 };
    504 }
    505 
    506 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    507 _mm_loadh_pd(__m128d __a, double const *__dp)
    508 {
    509   struct __mm_loadh_pd_struct {
    510     double __u;
    511   } __attribute__((__packed__, __may_alias__));
    512   double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
    513   return (__m128d){ __a[0], __u };
    514 }
    515 
    516 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    517 _mm_loadl_pd(__m128d __a, double const *__dp)
    518 {
    519   struct __mm_loadl_pd_struct {
    520     double __u;
    521   } __attribute__((__packed__, __may_alias__));
    522   double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
    523   return (__m128d){ __u, __a[1] };
    524 }
    525 
    526 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    527 _mm_set_sd(double __w)
    528 {
    529   return (__m128d){ __w, 0 };
    530 }
    531 
    532 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    533 _mm_set1_pd(double __w)
    534 {
    535   return (__m128d){ __w, __w };
    536 }
    537 
    538 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    539 _mm_set_pd(double __w, double __x)
    540 {
    541   return (__m128d){ __x, __w };
    542 }
    543 
    544 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    545 _mm_setr_pd(double __w, double __x)
    546 {
    547   return (__m128d){ __w, __x };
    548 }
    549 
    550 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    551 _mm_setzero_pd(void)
    552 {
    553   return (__m128d){ 0, 0 };
    554 }
    555 
    556 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    557 _mm_move_sd(__m128d __a, __m128d __b)
    558 {
    559   return (__m128d){ __b[0], __a[1] };
    560 }
    561 
    562 static __inline__ void __attribute__((__always_inline__, __nodebug__))
    563 _mm_store_sd(double *__dp, __m128d __a)
    564 {
    565   struct __mm_store_sd_struct {
    566     double __u;
    567   } __attribute__((__packed__, __may_alias__));
    568   ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
    569 }
    570 
    571 static __inline__ void __attribute__((__always_inline__, __nodebug__))
    572 _mm_store1_pd(double *__dp, __m128d __a)
    573 {
    574   struct __mm_store1_pd_struct {
    575     double __u[2];
    576   } __attribute__((__packed__, __may_alias__));
    577   ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
    578   ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
    579 }
    580 
    581 static __inline__ void __attribute__((__always_inline__, __nodebug__))
    582 _mm_store_pd(double *__dp, __m128d __a)
    583 {
    584   *(__m128d *)__dp = __a;
    585 }
    586 
    587 static __inline__ void __attribute__((__always_inline__, __nodebug__))
    588 _mm_storeu_pd(double *__dp, __m128d __a)
    589 {
    590   __builtin_ia32_storeupd(__dp, __a);
    591 }
    592 
    593 static __inline__ void __attribute__((__always_inline__, __nodebug__))
    594 _mm_storer_pd(double *__dp, __m128d __a)
    595 {
    596   __a = __builtin_shufflevector(__a, __a, 1, 0);
    597   *(__m128d *)__dp = __a;
    598 }
    599 
    600 static __inline__ void __attribute__((__always_inline__, __nodebug__))
    601 _mm_storeh_pd(double *__dp, __m128d __a)
    602 {
    603   struct __mm_storeh_pd_struct {
    604     double __u;
    605   } __attribute__((__packed__, __may_alias__));
    606   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
    607 }
    608 
    609 static __inline__ void __attribute__((__always_inline__, __nodebug__))
    610 _mm_storel_pd(double *__dp, __m128d __a)
    611 {
    612   struct __mm_storeh_pd_struct {
    613     double __u;
    614   } __attribute__((__packed__, __may_alias__));
    615   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
    616 }
    617 
    618 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    619 _mm_add_epi8(__m128i __a, __m128i __b)
    620 {
    621   return (__m128i)((__v16qi)__a + (__v16qi)__b);
    622 }
    623 
    624 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    625 _mm_add_epi16(__m128i __a, __m128i __b)
    626 {
    627   return (__m128i)((__v8hi)__a + (__v8hi)__b);
    628 }
    629 
    630 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    631 _mm_add_epi32(__m128i __a, __m128i __b)
    632 {
    633   return (__m128i)((__v4si)__a + (__v4si)__b);
    634 }
    635 
    636 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
    637 _mm_add_si64(__m64 __a, __m64 __b)
    638 {
    639   return __a + __b;
    640 }
    641 
    642 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    643 _mm_add_epi64(__m128i __a, __m128i __b)
    644 {
    645   return __a + __b;
    646 }
    647 
    648 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    649 _mm_adds_epi8(__m128i __a, __m128i __b)
    650 {
    651   return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
    652 }
    653 
    654 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    655 _mm_adds_epi16(__m128i __a, __m128i __b)
    656 {
    657   return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
    658 }
    659 
    660 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    661 _mm_adds_epu8(__m128i __a, __m128i __b)
    662 {
    663   return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
    664 }
    665 
    666 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    667 _mm_adds_epu16(__m128i __a, __m128i __b)
    668 {
    669   return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
    670 }
    671 
    672 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    673 _mm_avg_epu8(__m128i __a, __m128i __b)
    674 {
    675   return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
    676 }
    677 
    678 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    679 _mm_avg_epu16(__m128i __a, __m128i __b)
    680 {
    681   return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
    682 }
    683 
    684 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    685 _mm_madd_epi16(__m128i __a, __m128i __b)
    686 {
    687   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
    688 }
    689 
    690 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    691 _mm_max_epi16(__m128i __a, __m128i __b)
    692 {
    693   return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
    694 }
    695 
    696 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    697 _mm_max_epu8(__m128i __a, __m128i __b)
    698 {
    699   return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
    700 }
    701 
    702 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    703 _mm_min_epi16(__m128i __a, __m128i __b)
    704 {
    705   return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
    706 }
    707 
    708 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    709 _mm_min_epu8(__m128i __a, __m128i __b)
    710 {
    711   return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
    712 }
    713 
    714 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    715 _mm_mulhi_epi16(__m128i __a, __m128i __b)
    716 {
    717   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
    718 }
    719 
    720 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    721 _mm_mulhi_epu16(__m128i __a, __m128i __b)
    722 {
    723   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
    724 }
    725 
    726 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    727 _mm_mullo_epi16(__m128i __a, __m128i __b)
    728 {
    729   return (__m128i)((__v8hi)__a * (__v8hi)__b);
    730 }
    731 
    732 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
    733 _mm_mul_su32(__m64 __a, __m64 __b)
    734 {
    735   return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
    736 }
    737 
    738 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    739 _mm_mul_epu32(__m128i __a, __m128i __b)
    740 {
    741   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
    742 }
    743 
    744 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    745 _mm_sad_epu8(__m128i __a, __m128i __b)
    746 {
    747   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
    748 }
    749 
    750 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    751 _mm_sub_epi8(__m128i __a, __m128i __b)
    752 {
    753   return (__m128i)((__v16qi)__a - (__v16qi)__b);
    754 }
    755 
    756 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    757 _mm_sub_epi16(__m128i __a, __m128i __b)
    758 {
    759   return (__m128i)((__v8hi)__a - (__v8hi)__b);
    760 }
    761 
    762 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    763 _mm_sub_epi32(__m128i __a, __m128i __b)
    764 {
    765   return (__m128i)((__v4si)__a - (__v4si)__b);
    766 }
    767 
    768 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
    769 _mm_sub_si64(__m64 __a, __m64 __b)
    770 {
    771   return __a - __b;
    772 }
    773 
    774 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    775 _mm_sub_epi64(__m128i __a, __m128i __b)
    776 {
    777   return __a - __b;
    778 }
    779 
    780 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    781 _mm_subs_epi8(__m128i __a, __m128i __b)
    782 {
    783   return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
    784 }
    785 
    786 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    787 _mm_subs_epi16(__m128i __a, __m128i __b)
    788 {
    789   return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
    790 }
    791 
    792 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    793 _mm_subs_epu8(__m128i __a, __m128i __b)
    794 {
    795   return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
    796 }
    797 
    798 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    799 _mm_subs_epu16(__m128i __a, __m128i __b)
    800 {
    801   return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
    802 }
    803 
    804 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    805 _mm_and_si128(__m128i __a, __m128i __b)
    806 {
    807   return __a & __b;
    808 }
    809 
    810 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    811 _mm_andnot_si128(__m128i __a, __m128i __b)
    812 {
    813   return ~__a & __b;
    814 }
    815 
    816 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    817 _mm_or_si128(__m128i __a, __m128i __b)
    818 {
    819   return __a | __b;
    820 }
    821 
    822 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    823 _mm_xor_si128(__m128i __a, __m128i __b)
    824 {
    825   return __a ^ __b;
    826 }
    827 
    /* Byte-wise left shift of `a` by `imm` bytes, written as a 16-byte shuffle.
       The shuffle's first input is an all-zero vector (from _mm_setzero_si128,
       which is not defined in this part of the file) and its second input is
       `a`. Shuffle indices 0..15 select from the first input and 16..31 from
       the second. For result byte k the index is:
         - 0 when any of bits 4..7 of imm is set, which selects a zero byte,
           so the whole result is zero;
         - otherwise 16 + k - (imm & 0xF), which is byte k - (imm & 0xF) of `a`
           when that is non-negative and a zero byte otherwise.
       imm is expanded many times and must be a compile-time constant, because
       the indices are arguments of __builtin_shufflevector. */
    828 #define _mm_slli_si128(a, imm) __extension__ ({                         \
    829   (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(),        \
    830                                    (__v16qi)(__m128i)(a),               \
    831                                    ((imm)&0xF0) ? 0 : 16 - ((imm)&0xF), \
    832                                    ((imm)&0xF0) ? 0 : 17 - ((imm)&0xF), \
    833                                    ((imm)&0xF0) ? 0 : 18 - ((imm)&0xF), \
    834                                    ((imm)&0xF0) ? 0 : 19 - ((imm)&0xF), \
    835                                    ((imm)&0xF0) ? 0 : 20 - ((imm)&0xF), \
    836                                    ((imm)&0xF0) ? 0 : 21 - ((imm)&0xF), \
    837                                    ((imm)&0xF0) ? 0 : 22 - ((imm)&0xF), \
    838                                    ((imm)&0xF0) ? 0 : 23 - ((imm)&0xF), \
    839                                    ((imm)&0xF0) ? 0 : 24 - ((imm)&0xF), \
    840                                    ((imm)&0xF0) ? 0 : 25 - ((imm)&0xF), \
    841                                    ((imm)&0xF0) ? 0 : 26 - ((imm)&0xF), \
    842                                    ((imm)&0xF0) ? 0 : 27 - ((imm)&0xF), \
    843                                    ((imm)&0xF0) ? 0 : 28 - ((imm)&0xF), \
    844                                    ((imm)&0xF0) ? 0 : 29 - ((imm)&0xF), \
    845                                    ((imm)&0xF0) ? 0 : 30 - ((imm)&0xF), \
    846                                    ((imm)&0xF0) ? 0 : 31 - ((imm)&0xF)); })
    847 
    /* Alternate spelling: _mm_bslli_si128(a, imm) expands to
       _mm_slli_si128((a), (imm)). */
    848 #define _mm_bslli_si128(a, imm) \
    849   _mm_slli_si128((a), (imm))
    850 
    851 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    852 _mm_slli_epi16(__m128i __a, int __count)
    853 {
    854   return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
    855 }
    856 
    857 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    858 _mm_sll_epi16(__m128i __a, __m128i __count)
    859 {
    860   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
    861 }
    862 
    863 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    864 _mm_slli_epi32(__m128i __a, int __count)
    865 {
    866   return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
    867 }
    868 
    869 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    870 _mm_sll_epi32(__m128i __a, __m128i __count)
    871 {
    872   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
    873 }
    874 
    875 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    876 _mm_slli_epi64(__m128i __a, int __count)
    877 {
    878   return __builtin_ia32_psllqi128(__a, __count);
    879 }
    880 
    881 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    882 _mm_sll_epi64(__m128i __a, __m128i __count)
    883 {
    884   return __builtin_ia32_psllq128(__a, __count);
    885 }
    886 
    887 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    888 _mm_srai_epi16(__m128i __a, int __count)
    889 {
    890   return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
    891 }
    892 
    893 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    894 _mm_sra_epi16(__m128i __a, __m128i __count)
    895 {
    896   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
    897 }
    898 
    899 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    900 _mm_srai_epi32(__m128i __a, int __count)
    901 {
    902   return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
    903 }
    904 
    905 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    906 _mm_sra_epi32(__m128i __a, __m128i __count)
    907 {
    908   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
    909 }
    910 
    /* Shuffle index for output lane k of _mm_srli_si128: (imm & 0xF) + k, or 16
       when any of bits 4-7 of imm is set.  Index 16 is the first lane of the
       second shuffle operand, which is the zero vector. */
    #define __mm_srli_si128_idx(imm, k) \
      ((((imm) & 0xF0) != 0) ? 16 : (((imm) & 0xF) + (k)))

    /* Shuffles the sixteen __v16qi lanes of a against an all-zero vector, using
       __mm_srli_si128_idx for every output lane.  imm is expanded once per lane,
       so it must be a side-effect-free constant expression. */
    #define _mm_srli_si128(a, imm) __extension__ ({                         \
      (__m128i)__builtin_shufflevector(                                     \
          (__v16qi)(__m128i)(a), (__v16qi)_mm_setzero_si128(),              \
          __mm_srli_si128_idx(imm, 0),  __mm_srli_si128_idx(imm, 1),        \
          __mm_srli_si128_idx(imm, 2),  __mm_srli_si128_idx(imm, 3),        \
          __mm_srli_si128_idx(imm, 4),  __mm_srli_si128_idx(imm, 5),        \
          __mm_srli_si128_idx(imm, 6),  __mm_srli_si128_idx(imm, 7),        \
          __mm_srli_si128_idx(imm, 8),  __mm_srli_si128_idx(imm, 9),        \
          __mm_srli_si128_idx(imm, 10), __mm_srli_si128_idx(imm, 11),       \
          __mm_srli_si128_idx(imm, 12), __mm_srli_si128_idx(imm, 13),       \
          __mm_srli_si128_idx(imm, 14), __mm_srli_si128_idx(imm, 15)); })
    930 
    /* Alias for _mm_srli_si128: both arguments are passed through unchanged. */
    #define _mm_bsrli_si128(a, imm) _mm_srli_si128((a), (imm))
    933 
    934 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    935 _mm_srli_epi16(__m128i __a, int __count)
    936 {
    937   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
    938 }
    939 
    940 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    941 _mm_srl_epi16(__m128i __a, __m128i __count)
    942 {
    943   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
    944 }
    945 
    946 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    947 _mm_srli_epi32(__m128i __a, int __count)
    948 {
    949   return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
    950 }
    951 
    952 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    953 _mm_srl_epi32(__m128i __a, __m128i __count)
    954 {
    955   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
    956 }
    957 
    958 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    959 _mm_srli_epi64(__m128i __a, int __count)
    960 {
    961   return __builtin_ia32_psrlqi128(__a, __count);
    962 }
    963 
    964 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    965 _mm_srl_epi64(__m128i __a, __m128i __count)
    966 {
    967   return __builtin_ia32_psrlq128(__a, __count);
    968 }
    969 
    970 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    971 _mm_cmpeq_epi8(__m128i __a, __m128i __b)
    972 {
    973   return (__m128i)((__v16qi)__a == (__v16qi)__b);
    974 }
    975 
    976 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    977 _mm_cmpeq_epi16(__m128i __a, __m128i __b)
    978 {
    979   return (__m128i)((__v8hi)__a == (__v8hi)__b);
    980 }
    981 
    982 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    983 _mm_cmpeq_epi32(__m128i __a, __m128i __b)
    984 {
    985   return (__m128i)((__v4si)__a == (__v4si)__b);
    986 }
    987 
    988 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    989 _mm_cmpgt_epi8(__m128i __a, __m128i __b)
    990 {
    991   /* This function always performs a signed comparison, but __v16qi is a char
    992      which may be signed or unsigned. */
    993   typedef signed char __v16qs __attribute__((__vector_size__(16)));
    994   return (__m128i)((__v16qs)__a > (__v16qs)__b);
    995 }
    996 
    997 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    998 _mm_cmpgt_epi16(__m128i __a, __m128i __b)
    999 {
   1000   return (__m128i)((__v8hi)__a > (__v8hi)__b);
   1001 }
   1002 
   1003 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1004 _mm_cmpgt_epi32(__m128i __a, __m128i __b)
   1005 {
   1006   return (__m128i)((__v4si)__a > (__v4si)__b);
   1007 }
   1008 
   1009 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1010 _mm_cmplt_epi8(__m128i __a, __m128i __b)
   1011 {
   1012   return _mm_cmpgt_epi8(__b, __a);
   1013 }
   1014 
   1015 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1016 _mm_cmplt_epi16(__m128i __a, __m128i __b)
   1017 {
   1018   return _mm_cmpgt_epi16(__b, __a);
   1019 }
   1020 
   1021 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1022 _mm_cmplt_epi32(__m128i __a, __m128i __b)
   1023 {
   1024   return _mm_cmpgt_epi32(__b, __a);
   1025 }
   1026 
   1027 #ifdef __x86_64__
   1028 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
   1029 _mm_cvtsi64_sd(__m128d __a, long long __b)
   1030 {
   1031   __a[0] = __b;
   1032   return __a;
   1033 }
   1034 
   1035 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
   1036 _mm_cvtsd_si64(__m128d __a)
   1037 {
   1038   return __builtin_ia32_cvtsd2si64(__a);
   1039 }
   1040 
   1041 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
   1042 _mm_cvttsd_si64(__m128d __a)
   1043 {
   1044   return __a[0];
   1045 }
   1046 #endif
   1047 
   1048 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
   1049 _mm_cvtepi32_ps(__m128i __a)
   1050 {
   1051   return __builtin_ia32_cvtdq2ps((__v4si)__a);
   1052 }
   1053 
   1054 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1055 _mm_cvtps_epi32(__m128 __a)
   1056 {
   1057   return (__m128i)__builtin_ia32_cvtps2dq(__a);
   1058 }
   1059 
   1060 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1061 _mm_cvttps_epi32(__m128 __a)
   1062 {
   1063   return (__m128i)__builtin_ia32_cvttps2dq(__a);
   1064 }
   1065 
   1066 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1067 _mm_cvtsi32_si128(int __a)
   1068 {
   1069   return (__m128i)(__v4si){ __a, 0, 0, 0 };
   1070 }
   1071 
   1072 #ifdef __x86_64__
   1073 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1074 _mm_cvtsi64_si128(long long __a)
   1075 {
   1076   return (__m128i){ __a, 0 };
   1077 }
   1078 #endif
   1079 
   1080 static __inline__ int __attribute__((__always_inline__, __nodebug__))
   1081 _mm_cvtsi128_si32(__m128i __a)
   1082 {
   1083   __v4si __b = (__v4si)__a;
   1084   return __b[0];
   1085 }
   1086 
   1087 #ifdef __x86_64__
   1088 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
   1089 _mm_cvtsi128_si64(__m128i __a)
   1090 {
   1091   return __a[0];
   1092 }
   1093 #endif
   1094 
   1095 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1096 _mm_load_si128(__m128i const *__p)
   1097 {
   1098   return *__p;
   1099 }
   1100 
   1101 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1102 _mm_loadu_si128(__m128i const *__p)
   1103 {
   1104   struct __loadu_si128 {
   1105     __m128i __v;
   1106   } __attribute__((__packed__, __may_alias__));
   1107   return ((struct __loadu_si128*)__p)->__v;
   1108 }
   1109 
   1110 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1111 _mm_loadl_epi64(__m128i const *__p)
   1112 {
   1113   struct __mm_loadl_epi64_struct {
   1114     long long __u;
   1115   } __attribute__((__packed__, __may_alias__));
   1116   return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
   1117 }
   1118 
   1119 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1120 _mm_set_epi64x(long long q1, long long q0)
   1121 {
   1122   return (__m128i){ q0, q1 };
   1123 }
   1124 
   1125 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1126 _mm_set_epi64(__m64 q1, __m64 q0)
   1127 {
   1128   return (__m128i){ (long long)q0, (long long)q1 };
   1129 }
   1130 
   1131 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1132 _mm_set_epi32(int i3, int i2, int i1, int i0)
   1133 {
   1134   return (__m128i)(__v4si){ i0, i1, i2, i3};
   1135 }
   1136 
   1137 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1138 _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
   1139 {
   1140   return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
   1141 }
   1142 
   1143 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1144 _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
   1145 {
   1146   return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
   1147 }
   1148 
   1149 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1150 _mm_set1_epi64x(long long __q)
   1151 {
   1152   return (__m128i){ __q, __q };
   1153 }
   1154 
   1155 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1156 _mm_set1_epi64(__m64 __q)
   1157 {
   1158   return (__m128i){ (long long)__q, (long long)__q };
   1159 }
   1160 
   1161 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1162 _mm_set1_epi32(int __i)
   1163 {
   1164   return (__m128i)(__v4si){ __i, __i, __i, __i };
   1165 }
   1166 
   1167 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1168 _mm_set1_epi16(short __w)
   1169 {
   1170   return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
   1171 }
   1172 
   1173 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1174 _mm_set1_epi8(char __b)
   1175 {
   1176   return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
   1177 }
   1178 
   1179 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1180 _mm_setr_epi64(__m64 q0, __m64 q1)
   1181 {
   1182   return (__m128i){ (long long)q0, (long long)q1 };
   1183 }
   1184 
   1185 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1186 _mm_setr_epi32(int i0, int i1, int i2, int i3)
   1187 {
   1188   return (__m128i)(__v4si){ i0, i1, i2, i3};
   1189 }
   1190 
   1191 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1192 _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
   1193 {
   1194   return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
   1195 }
   1196 
   1197 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1198 _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
   1199 {
   1200   return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
   1201 }
   1202 
   1203 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1204 _mm_setzero_si128(void)
   1205 {
   1206   return (__m128i){ 0LL, 0LL };
   1207 }
   1208 
   1209 static __inline__ void __attribute__((__always_inline__, __nodebug__))
   1210 _mm_store_si128(__m128i *__p, __m128i __b)
   1211 {
   1212   *__p = __b;
   1213 }
   1214 
   1215 static __inline__ void __attribute__((__always_inline__, __nodebug__))
   1216 _mm_storeu_si128(__m128i *__p, __m128i __b)
   1217 {
   1218   __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
   1219 }
   1220 
   1221 static __inline__ void __attribute__((__always_inline__, __nodebug__))
   1222 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
   1223 {
   1224   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
   1225 }
   1226 
   1227 static __inline__ void __attribute__((__always_inline__, __nodebug__))
   1228 _mm_storel_epi64(__m128i *__p, __m128i __a)
   1229 {
   1230   struct __mm_storel_epi64_struct {
   1231     long long __u;
   1232   } __attribute__((__packed__, __may_alias__));
   1233   ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
   1234 }
   1235 
   1236 static __inline__ void __attribute__((__always_inline__, __nodebug__))
   1237 _mm_stream_pd(double *__p, __m128d __a)
   1238 {
   1239   __builtin_ia32_movntpd(__p, __a);
   1240 }
   1241 
   1242 static __inline__ void __attribute__((__always_inline__, __nodebug__))
   1243 _mm_stream_si128(__m128i *__p, __m128i __a)
   1244 {
   1245   __builtin_ia32_movntdq(__p, __a);
   1246 }
   1247 
   /* Passes the destination __p and the int __a to the movnti builtin. */
   static __inline__ void
   __attribute__((__always_inline__, __nodebug__))
   _mm_stream_si32(int *__p, int __a)
   {
     __builtin_ia32_movnti(__p, __a);
   }
   1253 
   1254 #ifdef __x86_64__
   /* Passes the destination __p and the long long __a to the movnti64 builtin. */
   static __inline__ void
   __attribute__((__always_inline__, __nodebug__))
   _mm_stream_si64(long long *__p, long long __a)
   {
     __builtin_ia32_movnti64(__p, __a);
   }
   1260 #endif
   1261 
   /* Passes the address __p to the clflush builtin. */
   static __inline__ void
   __attribute__((__always_inline__, __nodebug__))
   _mm_clflush(void const *__p)
   {
     __builtin_ia32_clflush(__p);
   }
   1267 
   /* Invokes the lfence builtin; takes no arguments and returns nothing. */
   static __inline__ void
   __attribute__((__always_inline__, __nodebug__))
   _mm_lfence(void)
   {
     __builtin_ia32_lfence();
   }
   1273 
   /* Invokes the mfence builtin; takes no arguments and returns nothing. */
   static __inline__ void
   __attribute__((__always_inline__, __nodebug__))
   _mm_mfence(void)
   {
     __builtin_ia32_mfence();
   }
   1279 
   1280 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1281 _mm_packs_epi16(__m128i __a, __m128i __b)
   1282 {
   1283   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
   1284 }
   1285 
   1286 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1287 _mm_packs_epi32(__m128i __a, __m128i __b)
   1288 {
   1289   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
   1290 }
   1291 
   1292 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1293 _mm_packus_epi16(__m128i __a, __m128i __b)
   1294 {
   1295   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
   1296 }
   1297 
   1298 static __inline__ int __attribute__((__always_inline__, __nodebug__))
   1299 _mm_extract_epi16(__m128i __a, int __imm)
   1300 {
   1301   __v8hi __b = (__v8hi)__a;
   1302   return (unsigned short)__b[__imm & 7];
   1303 }
   1304 
   1305 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1306 _mm_insert_epi16(__m128i __a, int __b, int __imm)
   1307 {
   1308   __v8hi __c = (__v8hi)__a;
   1309   __c[__imm & 7] = __b;
   1310   return (__m128i)__c;
   1311 }
   1312 
   1313 static __inline__ int __attribute__((__always_inline__, __nodebug__))
   1314 _mm_movemask_epi8(__m128i __a)
   1315 {
   1316   return __builtin_ia32_pmovmskb128((__v16qi)__a);
   1317 }
   1318 
   /* Shuffles the four __v4si lanes of a.  Output lane k takes the source lane
      named by bits [2k+1:2k] of imm, which must be a constant expression. */
   #define _mm_shuffle_epi32(a, imm) __extension__ ({ \
     (__m128i)__builtin_shufflevector( \
         (__v4si)(__m128i)(a), (__v4si)_mm_set1_epi32(0), \
         (((imm) >> 0) & 0x3), (((imm) >> 2) & 0x3), \
         (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3)); })
   1324 
   /* Shuffles the low four __v8hi lanes of a.  Output lane k (k = 0..3) takes the
      source lane named by bits [2k+1:2k] of imm; lanes 4-7 are copied through
      unchanged.  imm must be a constant expression. */
   #define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
     (__m128i)__builtin_shufflevector( \
         (__v8hi)(__m128i)(a), (__v8hi)_mm_set1_epi16(0), \
         (((imm) >> 0) & 0x3), (((imm) >> 2) & 0x3), \
         (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), \
         4, 5, 6, 7); })
   1331 
   /* Shuffles the high four __v8hi lanes of a.  Lanes 0-3 are copied through
      unchanged; output lane 4+k takes source lane 4 + bits [2k+1:2k] of imm.
      imm must be a constant expression. */
   #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
     (__m128i)__builtin_shufflevector( \
         (__v8hi)(__m128i)(a), (__v8hi)_mm_set1_epi16(0), \
         0, 1, 2, 3, \
         4 + (((imm) >> 0) & 0x3), 4 + (((imm) >> 2) & 0x3), \
         4 + (((imm) >> 4) & 0x3), 4 + (((imm) >> 6) & 0x3)); })
   1340 
   1341 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1342 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
   1343 {
   1344   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
   1345 }
   1346 
   1347 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1348 _mm_unpackhi_epi16(__m128i __a, __m128i __b)
   1349 {
   1350   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
   1351 }
   1352 
   1353 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1354 _mm_unpackhi_epi32(__m128i __a, __m128i __b)
   1355 {
   1356   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
   1357 }
   1358 
   1359 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1360 _mm_unpackhi_epi64(__m128i __a, __m128i __b)
   1361 {
   1362   return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
   1363 }
   1364 
   1365 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1366 _mm_unpacklo_epi8(__m128i __a, __m128i __b)
   1367 {
   1368   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
   1369 }
   1370 
   1371 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1372 _mm_unpacklo_epi16(__m128i __a, __m128i __b)
   1373 {
   1374   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
   1375 }
   1376 
   1377 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1378 _mm_unpacklo_epi32(__m128i __a, __m128i __b)
   1379 {
   1380   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
   1381 }
   1382 
   1383 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1384 _mm_unpacklo_epi64(__m128i __a, __m128i __b)
   1385 {
   1386   return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
   1387 }
   1388 
   1389 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
   1390 _mm_movepi64_pi64(__m128i __a)
   1391 {
   1392   return (__m64)__a[0];
   1393 }
   1394 
   1395 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1396 _mm_movpi64_epi64(__m64 __a)
   1397 {
   1398   return (__m128i){ (long long)__a, 0 };
   1399 }
   1400 
   1401 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1402 _mm_move_epi64(__m128i __a)
   1403 {
   1404   return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
   1405 }
   1406 
   1407 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
   1408 _mm_unpackhi_pd(__m128d __a, __m128d __b)
   1409 {
   1410   return __builtin_shufflevector(__a, __b, 1, 2+1);
   1411 }
   1412 
   1413 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
   1414 _mm_unpacklo_pd(__m128d __a, __m128d __b)
   1415 {
   1416   return __builtin_shufflevector(__a, __b, 0, 2+0);
   1417 }
   1418 
   1419 static __inline__ int __attribute__((__always_inline__, __nodebug__))
   1420 _mm_movemask_pd(__m128d __a)
   1421 {
   1422   return __builtin_ia32_movmskpd(__a);
   1423 }
   1424 
   /* Result element 0 is a[bit 0 of i]; result element 1 is b[bit 1 of i]
      (shuffle indices 2 and 3 address b).  i must be a constant expression. */
   #define _mm_shuffle_pd(a, b, i) __extension__ ({ \
     __builtin_shufflevector((__m128d)(a), (__m128d)(b), \
                             (((i) >> 0) & 1), \
                             ((((i) >> 1) & 1) + 2)); })
   1428 
   1429 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
   1430 _mm_castpd_ps(__m128d __a)
   1431 {
   1432   return (__m128)__a;
   1433 }
   1434 
   1435 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1436 _mm_castpd_si128(__m128d __a)
   1437 {
   1438   return (__m128i)__a;
   1439 }
   1440 
   1441 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
   1442 _mm_castps_pd(__m128 __a)
   1443 {
   1444   return (__m128d)__a;
   1445 }
   1446 
   1447 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1448 _mm_castps_si128(__m128 __a)
   1449 {
   1450   return (__m128i)__a;
   1451 }
   1452 
   1453 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
   1454 _mm_castsi128_ps(__m128i __a)
   1455 {
   1456   return (__m128)__a;
   1457 }
   1458 
   1459 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
   1460 _mm_castsi128_pd(__m128i __a)
   1461 {
   1462   return (__m128d)__a;
   1463 }
   1464 
   /* Emits a single `pause` instruction through inline assembly.  The asm is
      volatile, so the compiler will not drop it. */
   static __inline__ void
   __attribute__((__always_inline__, __nodebug__))
   _mm_pause(void)
   {
     __asm__ __volatile__("pause");
   }
   1470 
   /* Packs two one-bit selectors into a shuffle immediate: y is bit 0 and x is
      bit 1. */
   #define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
   1472 
   1473 #endif /* __SSE2__ */
   1474 
   1475 #endif /* __EMMINTRIN_H */
   1476