Home | History | Annotate | Download | only in clang-include
      1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
      2  *
      3  * Permission is hereby granted, free of charge, to any person obtaining a copy
      4  * of this software and associated documentation files (the "Software"), to deal
      5  * in the Software without restriction, including without limitation the rights
      6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      7  * copies of the Software, and to permit persons to whom the Software is
      8  * furnished to do so, subject to the following conditions:
      9  *
     10  * The above copyright notice and this permission notice shall be included in
     11  * all copies or substantial portions of the Software.
     12  *
     13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     19  * THE SOFTWARE.
     20  *
     21  *===-----------------------------------------------------------------------===
     22  */
     23 
     24 #ifndef __EMMINTRIN_H
     25 #define __EMMINTRIN_H
     26 
     27 #ifndef __SSE2__
     28 #error "SSE2 instruction set not enabled"
     29 #else
     30 
     31 #include <xmmintrin.h>
     32 
     33 typedef double __m128d __attribute__((__vector_size__(16)));
     34 typedef long long __m128i __attribute__((__vector_size__(16)));
     35 
     36 /* Type defines.  */
     37 typedef double __v2df __attribute__ ((__vector_size__ (16)));
     38 typedef long long __v2di __attribute__ ((__vector_size__ (16)));
     39 typedef short __v8hi __attribute__((__vector_size__(16)));
     40 typedef char __v16qi __attribute__((__vector_size__(16)));
     41 
     42 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
     43 _mm_add_sd(__m128d a, __m128d b)
     44 {
     45   a[0] += b[0];
     46   return a;
     47 }
     48 
     49 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
     50 _mm_add_pd(__m128d a, __m128d b)
     51 {
     52   return a + b;
     53 }
     54 
     55 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
     56 _mm_sub_sd(__m128d a, __m128d b)
     57 {
     58   a[0] -= b[0];
     59   return a;
     60 }
     61 
     62 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
     63 _mm_sub_pd(__m128d a, __m128d b)
     64 {
     65   return a - b;
     66 }
     67 
     68 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
     69 _mm_mul_sd(__m128d a, __m128d b)
     70 {
     71   a[0] *= b[0];
     72   return a;
     73 }
     74 
     75 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
     76 _mm_mul_pd(__m128d a, __m128d b)
     77 {
     78   return a * b;
     79 }
     80 
     81 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
     82 _mm_div_sd(__m128d a, __m128d b)
     83 {
     84   a[0] /= b[0];
     85   return a;
     86 }
     87 
     88 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
     89 _mm_div_pd(__m128d a, __m128d b)
     90 {
     91   return a / b;
     92 }
     93 
     94 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
     95 _mm_sqrt_sd(__m128d a, __m128d b)
     96 {
     97   __m128d c = __builtin_ia32_sqrtsd(b);
     98   return (__m128d) { c[0], a[1] };
     99 }
    100 
    101 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    102 _mm_sqrt_pd(__m128d a)
    103 {
    104   return __builtin_ia32_sqrtpd(a);
    105 }
    106 
    107 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    108 _mm_min_sd(__m128d a, __m128d b)
    109 {
    110   return __builtin_ia32_minsd(a, b);
    111 }
    112 
    113 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    114 _mm_min_pd(__m128d a, __m128d b)
    115 {
    116   return __builtin_ia32_minpd(a, b);
    117 }
    118 
    119 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    120 _mm_max_sd(__m128d a, __m128d b)
    121 {
    122   return __builtin_ia32_maxsd(a, b);
    123 }
    124 
    125 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    126 _mm_max_pd(__m128d a, __m128d b)
    127 {
    128   return __builtin_ia32_maxpd(a, b);
    129 }
    130 
    131 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    132 _mm_and_pd(__m128d a, __m128d b)
    133 {
    134   return (__m128d)((__v4si)a & (__v4si)b);
    135 }
    136 
    137 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    138 _mm_andnot_pd(__m128d a, __m128d b)
    139 {
    140   return (__m128d)(~(__v4si)a & (__v4si)b);
    141 }
    142 
    143 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    144 _mm_or_pd(__m128d a, __m128d b)
    145 {
    146   return (__m128d)((__v4si)a | (__v4si)b);
    147 }
    148 
    149 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    150 _mm_xor_pd(__m128d a, __m128d b)
    151 {
    152   return (__m128d)((__v4si)a ^ (__v4si)b);
    153 }
    154 
    155 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    156 _mm_cmpeq_pd(__m128d a, __m128d b)
    157 {
    158   return (__m128d)__builtin_ia32_cmppd(a, b, 0);
    159 }
    160 
    161 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    162 _mm_cmplt_pd(__m128d a, __m128d b)
    163 {
    164   return (__m128d)__builtin_ia32_cmppd(a, b, 1);
    165 }
    166 
    167 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    168 _mm_cmple_pd(__m128d a, __m128d b)
    169 {
    170   return (__m128d)__builtin_ia32_cmppd(a, b, 2);
    171 }
    172 
    173 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    174 _mm_cmpgt_pd(__m128d a, __m128d b)
    175 {
    176   return (__m128d)__builtin_ia32_cmppd(b, a, 1);
    177 }
    178 
    179 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    180 _mm_cmpge_pd(__m128d a, __m128d b)
    181 {
    182   return (__m128d)__builtin_ia32_cmppd(b, a, 2);
    183 }
    184 
    185 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    186 _mm_cmpord_pd(__m128d a, __m128d b)
    187 {
    188   return (__m128d)__builtin_ia32_cmppd(a, b, 7);
    189 }
    190 
    191 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    192 _mm_cmpunord_pd(__m128d a, __m128d b)
    193 {
    194   return (__m128d)__builtin_ia32_cmppd(a, b, 3);
    195 }
    196 
    197 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    198 _mm_cmpneq_pd(__m128d a, __m128d b)
    199 {
    200   return (__m128d)__builtin_ia32_cmppd(a, b, 4);
    201 }
    202 
    203 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    204 _mm_cmpnlt_pd(__m128d a, __m128d b)
    205 {
    206   return (__m128d)__builtin_ia32_cmppd(a, b, 5);
    207 }
    208 
    209 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    210 _mm_cmpnle_pd(__m128d a, __m128d b)
    211 {
    212   return (__m128d)__builtin_ia32_cmppd(a, b, 6);
    213 }
    214 
    215 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    216 _mm_cmpngt_pd(__m128d a, __m128d b)
    217 {
    218   return (__m128d)__builtin_ia32_cmppd(b, a, 5);
    219 }
    220 
    221 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    222 _mm_cmpnge_pd(__m128d a, __m128d b)
    223 {
    224   return (__m128d)__builtin_ia32_cmppd(b, a, 6);
    225 }
    226 
    227 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    228 _mm_cmpeq_sd(__m128d a, __m128d b)
    229 {
    230   return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
    231 }
    232 
    233 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    234 _mm_cmplt_sd(__m128d a, __m128d b)
    235 {
    236   return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
    237 }
    238 
    239 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    240 _mm_cmple_sd(__m128d a, __m128d b)
    241 {
    242   return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
    243 }
    244 
    245 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    246 _mm_cmpgt_sd(__m128d a, __m128d b)
    247 {
    248   return (__m128d)__builtin_ia32_cmpsd(b, a, 1);
    249 }
    250 
    251 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    252 _mm_cmpge_sd(__m128d a, __m128d b)
    253 {
    254   return (__m128d)__builtin_ia32_cmpsd(b, a, 2);
    255 }
    256 
    257 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    258 _mm_cmpord_sd(__m128d a, __m128d b)
    259 {
    260   return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
    261 }
    262 
    263 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    264 _mm_cmpunord_sd(__m128d a, __m128d b)
    265 {
    266   return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
    267 }
    268 
    269 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    270 _mm_cmpneq_sd(__m128d a, __m128d b)
    271 {
    272   return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
    273 }
    274 
    275 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    276 _mm_cmpnlt_sd(__m128d a, __m128d b)
    277 {
    278   return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
    279 }
    280 
    281 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    282 _mm_cmpnle_sd(__m128d a, __m128d b)
    283 {
    284   return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
    285 }
    286 
    287 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    288 _mm_cmpngt_sd(__m128d a, __m128d b)
    289 {
    290   return (__m128d)__builtin_ia32_cmpsd(b, a, 5);
    291 }
    292 
    293 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    294 _mm_cmpnge_sd(__m128d a, __m128d b)
    295 {
    296   return (__m128d)__builtin_ia32_cmpsd(b, a, 6);
    297 }
    298 
    299 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    300 _mm_comieq_sd(__m128d a, __m128d b)
    301 {
    302   return __builtin_ia32_comisdeq(a, b);
    303 }
    304 
    305 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    306 _mm_comilt_sd(__m128d a, __m128d b)
    307 {
    308   return __builtin_ia32_comisdlt(a, b);
    309 }
    310 
    311 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    312 _mm_comile_sd(__m128d a, __m128d b)
    313 {
    314   return __builtin_ia32_comisdle(a, b);
    315 }
    316 
    317 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    318 _mm_comigt_sd(__m128d a, __m128d b)
    319 {
    320   return __builtin_ia32_comisdgt(a, b);
    321 }
    322 
    323 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    324 _mm_comineq_sd(__m128d a, __m128d b)
    325 {
    326   return __builtin_ia32_comisdneq(a, b);
    327 }
    328 
    329 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    330 _mm_ucomieq_sd(__m128d a, __m128d b)
    331 {
    332   return __builtin_ia32_ucomisdeq(a, b);
    333 }
    334 
    335 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    336 _mm_ucomilt_sd(__m128d a, __m128d b)
    337 {
    338   return __builtin_ia32_ucomisdlt(a, b);
    339 }
    340 
    341 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    342 _mm_ucomile_sd(__m128d a, __m128d b)
    343 {
    344   return __builtin_ia32_ucomisdle(a, b);
    345 }
    346 
    347 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    348 _mm_ucomigt_sd(__m128d a, __m128d b)
    349 {
    350   return __builtin_ia32_ucomisdgt(a, b);
    351 }
    352 
    353 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    354 _mm_ucomineq_sd(__m128d a, __m128d b)
    355 {
    356   return __builtin_ia32_ucomisdneq(a, b);
    357 }
    358 
    359 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
    360 _mm_cvtpd_ps(__m128d a)
    361 {
    362   return __builtin_ia32_cvtpd2ps(a);
    363 }
    364 
    365 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    366 _mm_cvtps_pd(__m128 a)
    367 {
    368   return __builtin_ia32_cvtps2pd(a);
    369 }
    370 
    371 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    372 _mm_cvtepi32_pd(__m128i a)
    373 {
    374   return __builtin_ia32_cvtdq2pd((__v4si)a);
    375 }
    376 
    377 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    378 _mm_cvtpd_epi32(__m128d a)
    379 {
    380   return __builtin_ia32_cvtpd2dq(a);
    381 }
    382 
    383 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    384 _mm_cvtsd_si32(__m128d a)
    385 {
    386   return __builtin_ia32_cvtsd2si(a);
    387 }
    388 
    389 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
    390 _mm_cvtsd_ss(__m128 a, __m128d b)
    391 {
    392   a[0] = b[0];
    393   return a;
    394 }
    395 
    396 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    397 _mm_cvtsi32_sd(__m128d a, int b)
    398 {
    399   a[0] = b;
    400   return a;
    401 }
    402 
    403 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    404 _mm_cvtss_sd(__m128d a, __m128 b)
    405 {
    406   a[0] = b[0];
    407   return a;
    408 }
    409 
    410 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    411 _mm_cvttpd_epi32(__m128d a)
    412 {
    413   return (__m128i)__builtin_ia32_cvttpd2dq(a);
    414 }
    415 
    416 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    417 _mm_cvttsd_si32(__m128d a)
    418 {
    419   return a[0];
    420 }
    421 
    422 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
    423 _mm_cvtpd_pi32(__m128d a)
    424 {
    425   return (__m64)__builtin_ia32_cvtpd2pi(a);
    426 }
    427 
    428 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
    429 _mm_cvttpd_pi32(__m128d a)
    430 {
    431   return (__m64)__builtin_ia32_cvttpd2pi(a);
    432 }
    433 
    434 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    435 _mm_cvtpi32_pd(__m64 a)
    436 {
    437   return __builtin_ia32_cvtpi2pd((__v2si)a);
    438 }
    439 
    440 static __inline__ double __attribute__((__always_inline__, __nodebug__))
    441 _mm_cvtsd_f64(__m128d a)
    442 {
    443   return a[0];
    444 }
    445 
    446 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    447 _mm_load_pd(double const *dp)
    448 {
    449   return *(__m128d*)dp;
    450 }
    451 
    452 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    453 _mm_load1_pd(double const *dp)
    454 {
    455   return (__m128d){ dp[0], dp[0] };
    456 }
    457 
    458 #define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
    459 
    460 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    461 _mm_loadr_pd(double const *dp)
    462 {
    463   return (__m128d){ dp[1], dp[0] };
    464 }
    465 
    466 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    467 _mm_loadu_pd(double const *dp)
    468 {
    469   return __builtin_ia32_loadupd(dp);
    470 }
    471 
    472 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    473 _mm_load_sd(double const *dp)
    474 {
    475   return (__m128d){ *dp, 0.0 };
    476 }
    477 
    478 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    479 _mm_loadh_pd(__m128d a, double const *dp)
    480 {
    481   return __builtin_shufflevector(a, *(__m128d *)dp, 0, 2);
    482 }
    483 
    484 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    485 _mm_loadl_pd(__m128d a, double const *dp)
    486 {
    487   return __builtin_shufflevector(a, *(__m128d *)dp, 2, 1);
    488 }
    489 
    490 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    491 _mm_set_sd(double w)
    492 {
    493   return (__m128d){ w, 0 };
    494 }
    495 
    496 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    497 _mm_set1_pd(double w)
    498 {
    499   return (__m128d){ w, w };
    500 }
    501 
    502 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    503 _mm_set_pd(double w, double x)
    504 {
    505   return (__m128d){ x, w };
    506 }
    507 
    508 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    509 _mm_setr_pd(double w, double x)
    510 {
    511   return (__m128d){ w, x };
    512 }
    513 
    514 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    515 _mm_setzero_pd(void)
    516 {
    517   return (__m128d){ 0, 0 };
    518 }
    519 
    520 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    521 _mm_move_sd(__m128d a, __m128d b)
    522 {
    523   return (__m128d){ b[0], a[1] };
    524 }
    525 
    526 static __inline__ void __attribute__((__always_inline__, __nodebug__))
    527 _mm_store_sd(double *dp, __m128d a)
    528 {
    529   dp[0] = a[0];
    530 }
    531 
    532 static __inline__ void __attribute__((__always_inline__, __nodebug__))
    533 _mm_store1_pd(double *dp, __m128d a)
    534 {
    535   dp[0] = a[0];
    536   dp[1] = a[0];
    537 }
    538 
    539 static __inline__ void __attribute__((__always_inline__, __nodebug__))
    540 _mm_store_pd(double *dp, __m128d a)
    541 {
    542   *(__m128d *)dp = a;
    543 }
    544 
    545 static __inline__ void __attribute__((__always_inline__, __nodebug__))
    546 _mm_storeu_pd(double *dp, __m128d a)
    547 {
    548   __builtin_ia32_storeupd(dp, a);
    549 }
    550 
    551 static __inline__ void __attribute__((__always_inline__, __nodebug__))
    552 _mm_storer_pd(double *dp, __m128d a)
    553 {
    554   dp[0] = a[1];
    555   dp[1] = a[0];
    556 }
    557 
    558 static __inline__ void __attribute__((__always_inline__, __nodebug__))
    559 _mm_storeh_pd(double *dp, __m128d a)
    560 {
    561   dp[0] = a[1];
    562 }
    563 
    564 static __inline__ void __attribute__((__always_inline__, __nodebug__))
    565 _mm_storel_pd(double *dp, __m128d a)
    566 {
    567   dp[0] = a[0];
    568 }
    569 
    570 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    571 _mm_add_epi8(__m128i a, __m128i b)
    572 {
    573   return (__m128i)((__v16qi)a + (__v16qi)b);
    574 }
    575 
    576 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    577 _mm_add_epi16(__m128i a, __m128i b)
    578 {
    579   return (__m128i)((__v8hi)a + (__v8hi)b);
    580 }
    581 
    582 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    583 _mm_add_epi32(__m128i a, __m128i b)
    584 {
    585   return (__m128i)((__v4si)a + (__v4si)b);
    586 }
    587 
    588 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
    589 _mm_add_si64(__m64 a, __m64 b)
    590 {
    591   return a + b;
    592 }
    593 
    594 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    595 _mm_add_epi64(__m128i a, __m128i b)
    596 {
    597   return a + b;
    598 }
    599 
    600 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    601 _mm_adds_epi8(__m128i a, __m128i b)
    602 {
    603   return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
    604 }
    605 
    606 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    607 _mm_adds_epi16(__m128i a, __m128i b)
    608 {
    609   return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
    610 }
    611 
    612 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    613 _mm_adds_epu8(__m128i a, __m128i b)
    614 {
    615   return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
    616 }
    617 
    618 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    619 _mm_adds_epu16(__m128i a, __m128i b)
    620 {
    621   return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
    622 }
    623 
    624 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    625 _mm_avg_epu8(__m128i a, __m128i b)
    626 {
    627   return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
    628 }
    629 
    630 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    631 _mm_avg_epu16(__m128i a, __m128i b)
    632 {
    633   return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
    634 }
    635 
    636 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    637 _mm_madd_epi16(__m128i a, __m128i b)
    638 {
    639   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
    640 }
    641 
    642 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    643 _mm_max_epi16(__m128i a, __m128i b)
    644 {
    645   return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
    646 }
    647 
    648 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    649 _mm_max_epu8(__m128i a, __m128i b)
    650 {
    651   return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
    652 }
    653 
    654 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    655 _mm_min_epi16(__m128i a, __m128i b)
    656 {
    657   return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
    658 }
    659 
    660 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    661 _mm_min_epu8(__m128i a, __m128i b)
    662 {
    663   return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
    664 }
    665 
    666 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    667 _mm_mulhi_epi16(__m128i a, __m128i b)
    668 {
    669   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
    670 }
    671 
    672 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    673 _mm_mulhi_epu16(__m128i a, __m128i b)
    674 {
    675   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
    676 }
    677 
    678 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    679 _mm_mullo_epi16(__m128i a, __m128i b)
    680 {
    681   return (__m128i)((__v8hi)a * (__v8hi)b);
    682 }
    683 
    684 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
    685 _mm_mul_su32(__m64 a, __m64 b)
    686 {
    687   return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
    688 }
    689 
    690 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    691 _mm_mul_epu32(__m128i a, __m128i b)
    692 {
    693   return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
    694 }
    695 
    696 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    697 _mm_sad_epu8(__m128i a, __m128i b)
    698 {
    699   return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
    700 }
    701 
    702 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    703 _mm_sub_epi8(__m128i a, __m128i b)
    704 {
    705   return (__m128i)((__v16qi)a - (__v16qi)b);
    706 }
    707 
    708 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    709 _mm_sub_epi16(__m128i a, __m128i b)
    710 {
    711   return (__m128i)((__v8hi)a - (__v8hi)b);
    712 }
    713 
    714 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    715 _mm_sub_epi32(__m128i a, __m128i b)
    716 {
    717   return (__m128i)((__v4si)a - (__v4si)b);
    718 }
    719 
    720 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
    721 _mm_sub_si64(__m64 a, __m64 b)
    722 {
    723   return a - b;
    724 }
    725 
    726 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    727 _mm_sub_epi64(__m128i a, __m128i b)
    728 {
    729   return a - b;
    730 }
    731 
    732 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    733 _mm_subs_epi8(__m128i a, __m128i b)
    734 {
    735   return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
    736 }
    737 
    738 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    739 _mm_subs_epi16(__m128i a, __m128i b)
    740 {
    741   return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
    742 }
    743 
    744 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    745 _mm_subs_epu8(__m128i a, __m128i b)
    746 {
    747   return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
    748 }
    749 
    750 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    751 _mm_subs_epu16(__m128i a, __m128i b)
    752 {
    753   return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
    754 }
    755 
    756 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    757 _mm_and_si128(__m128i a, __m128i b)
    758 {
    759   return a & b;
    760 }
    761 
    762 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    763 _mm_andnot_si128(__m128i a, __m128i b)
    764 {
    765   return ~a & b;
    766 }
    767 
    768 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    769 _mm_or_si128(__m128i a, __m128i b)
    770 {
    771   return a | b;
    772 }
    773 
    774 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    775 _mm_xor_si128(__m128i a, __m128i b)
    776 {
    777   return a ^ b;
    778 }
    779 
    780 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    781 _mm_slli_si128(__m128i a, int imm)
    782 {
    783   return __builtin_ia32_pslldqi128(a, imm * 8);
    784 }
    785 
    786 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    787 _mm_slli_epi16(__m128i a, int count)
    788 {
    789   return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
    790 }
    791 
    792 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    793 _mm_sll_epi16(__m128i a, __m128i count)
    794 {
    795   return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
    796 }
    797 
    798 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    799 _mm_slli_epi32(__m128i a, int count)
    800 {
    801   return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
    802 }
    803 
    804 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    805 _mm_sll_epi32(__m128i a, __m128i count)
    806 {
    807   return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
    808 }
    809 
    810 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    811 _mm_slli_epi64(__m128i a, int count)
    812 {
    813   return __builtin_ia32_psllqi128(a, count);
    814 }
    815 
    816 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    817 _mm_sll_epi64(__m128i a, __m128i count)
    818 {
    819   return __builtin_ia32_psllq128(a, count);
    820 }
    821 
    822 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    823 _mm_srai_epi16(__m128i a, int count)
    824 {
    825   return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
    826 }
    827 
    828 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    829 _mm_sra_epi16(__m128i a, __m128i count)
    830 {
    831   return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
    832 }
    833 
    834 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    835 _mm_srai_epi32(__m128i a, int count)
    836 {
    837   return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
    838 }
    839 
    840 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    841 _mm_sra_epi32(__m128i a, __m128i count)
    842 {
    843   return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
    844 }
    845 
    846 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    847 _mm_srli_si128(__m128i a, int imm)
    848 {
    849   return __builtin_ia32_psrldqi128(a, imm * 8);
    850 }
    851 
    852 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    853 _mm_srli_epi16(__m128i a, int count)
    854 {
    855   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
    856 }
    857 
    858 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    859 _mm_srl_epi16(__m128i a, __m128i count)
    860 {
    861   return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
    862 }
    863 
    864 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    865 _mm_srli_epi32(__m128i a, int count)
    866 {
    867   return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
    868 }
    869 
    870 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    871 _mm_srl_epi32(__m128i a, __m128i count)
    872 {
    873   return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
    874 }
    875 
    876 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    877 _mm_srli_epi64(__m128i a, int count)
    878 {
    879   return __builtin_ia32_psrlqi128(a, count);
    880 }
    881 
    882 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    883 _mm_srl_epi64(__m128i a, __m128i count)
    884 {
    885   return __builtin_ia32_psrlq128(a, count);
    886 }
    887 
    888 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    889 _mm_cmpeq_epi8(__m128i a, __m128i b)
    890 {
    891   return (__m128i)((__v16qi)a == (__v16qi)b);
    892 }
    893 
    894 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    895 _mm_cmpeq_epi16(__m128i a, __m128i b)
    896 {
    897   return (__m128i)((__v8hi)a == (__v8hi)b);
    898 }
    899 
    900 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    901 _mm_cmpeq_epi32(__m128i a, __m128i b)
    902 {
    903   return (__m128i)((__v4si)a == (__v4si)b);
    904 }
    905 
    906 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    907 _mm_cmpgt_epi8(__m128i a, __m128i b)
    908 {
    909   return (__m128i)((__v16qi)a > (__v16qi)b);
    910 }
    911 
    912 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    913 _mm_cmpgt_epi16(__m128i a, __m128i b)
    914 {
    915   return (__m128i)((__v8hi)a > (__v8hi)b);
    916 }
    917 
    918 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    919 _mm_cmpgt_epi32(__m128i a, __m128i b)
    920 {
    921   return (__m128i)((__v4si)a > (__v4si)b);
    922 }
    923 
    924 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    925 _mm_cmplt_epi8(__m128i a, __m128i b)
    926 {
    927   return _mm_cmpgt_epi8(b,a);
    928 }
    929 
    930 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    931 _mm_cmplt_epi16(__m128i a, __m128i b)
    932 {
    933   return _mm_cmpgt_epi16(b,a);
    934 }
    935 
    936 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    937 _mm_cmplt_epi32(__m128i a, __m128i b)
    938 {
    939   return _mm_cmpgt_epi32(b,a);
    940 }
    941 
    942 #ifdef __x86_64__
    943 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
    944 _mm_cvtsi64_sd(__m128d a, long long b)
    945 {
    946   a[0] = b;
    947   return a;
    948 }
    949 
    950 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
    951 _mm_cvtsd_si64(__m128d a)
    952 {
    953   return __builtin_ia32_cvtsd2si64(a);
    954 }
    955 
    956 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
    957 _mm_cvttsd_si64(__m128d a)
    958 {
    959   return a[0];
    960 }
    961 #endif
    962 
    963 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
    964 _mm_cvtepi32_ps(__m128i a)
    965 {
    966   return __builtin_ia32_cvtdq2ps((__v4si)a);
    967 }
    968 
    969 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    970 _mm_cvtps_epi32(__m128 a)
    971 {
    972   return (__m128i)__builtin_ia32_cvtps2dq(a);
    973 }
    974 
    975 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    976 _mm_cvttps_epi32(__m128 a)
    977 {
    978   return (__m128i)__builtin_ia32_cvttps2dq(a);
    979 }
    980 
    981 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    982 _mm_cvtsi32_si128(int a)
    983 {
    984   return (__m128i)(__v4si){ a, 0, 0, 0 };
    985 }
    986 
    987 #ifdef __x86_64__
    988 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
    989 _mm_cvtsi64_si128(long long a)
    990 {
    991   return (__m128i){ a, 0 };
    992 }
    993 #endif
    994 
    995 static __inline__ int __attribute__((__always_inline__, __nodebug__))
    996 _mm_cvtsi128_si32(__m128i a)
    997 {
    998   __v4si b = (__v4si)a;
    999   return b[0];
   1000 }
   1001 
   1002 #ifdef __x86_64__
   1003 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
   1004 _mm_cvtsi128_si64(__m128i a)
   1005 {
   1006   return a[0];
   1007 }
   1008 #endif
   1009 
   1010 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1011 _mm_load_si128(__m128i const *p)
   1012 {
   1013   return *p;
   1014 }
   1015 
   1016 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1017 _mm_loadu_si128(__m128i const *p)
   1018 {
   1019   return (__m128i)__builtin_ia32_loaddqu((char const *)p);
   1020 }
   1021 
   1022 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1023 _mm_loadl_epi64(__m128i const *p)
   1024 {
   1025   return (__m128i) { *(long long*)p, 0};
   1026 }
   1027 
   1028 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1029 _mm_set_epi64x(long long q1, long long q0)
   1030 {
   1031   return (__m128i){ q0, q1 };
   1032 }
   1033 
   1034 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1035 _mm_set_epi64(__m64 q1, __m64 q0)
   1036 {
   1037   return (__m128i){ (long long)q0, (long long)q1 };
   1038 }
   1039 
   1040 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1041 _mm_set_epi32(int i3, int i2, int i1, int i0)
   1042 {
   1043   return (__m128i)(__v4si){ i0, i1, i2, i3};
   1044 }
   1045 
   1046 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1047 _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
   1048 {
   1049   return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
   1050 }
   1051 
   1052 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1053 _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
   1054 {
   1055   return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
   1056 }
   1057 
   1058 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1059 _mm_set1_epi64x(long long q)
   1060 {
   1061   return (__m128i){ q, q };
   1062 }
   1063 
   1064 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1065 _mm_set1_epi64(__m64 q)
   1066 {
   1067   return (__m128i){ (long long)q, (long long)q };
   1068 }
   1069 
   1070 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1071 _mm_set1_epi32(int i)
   1072 {
   1073   return (__m128i)(__v4si){ i, i, i, i };
   1074 }
   1075 
   1076 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1077 _mm_set1_epi16(short w)
   1078 {
   1079   return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
   1080 }
   1081 
   1082 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1083 _mm_set1_epi8(char b)
   1084 {
   1085   return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
   1086 }
   1087 
   1088 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1089 _mm_setr_epi64(__m64 q0, __m64 q1)
   1090 {
   1091   return (__m128i){ (long long)q0, (long long)q1 };
   1092 }
   1093 
   1094 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1095 _mm_setr_epi32(int i0, int i1, int i2, int i3)
   1096 {
   1097   return (__m128i)(__v4si){ i0, i1, i2, i3};
   1098 }
   1099 
   1100 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1101 _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
   1102 {
   1103   return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
   1104 }
   1105 
   1106 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1107 _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
   1108 {
   1109   return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
   1110 }
   1111 
   1112 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1113 _mm_setzero_si128(void)
   1114 {
   1115   return (__m128i){ 0LL, 0LL };
   1116 }
   1117 
   1118 static __inline__ void __attribute__((__always_inline__, __nodebug__))
   1119 _mm_store_si128(__m128i *p, __m128i b)
   1120 {
   1121   *p = b;
   1122 }
   1123 
   1124 static __inline__ void __attribute__((__always_inline__, __nodebug__))
   1125 _mm_storeu_si128(__m128i *p, __m128i b)
   1126 {
   1127   __builtin_ia32_storedqu((char *)p, (__v16qi)b);
   1128 }
   1129 
   1130 static __inline__ void __attribute__((__always_inline__, __nodebug__))
   1131 _mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
   1132 {
   1133   __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
   1134 }
   1135 
   1136 static __inline__ void __attribute__((__always_inline__, __nodebug__))
   1137 _mm_storel_epi64(__m128i *p, __m128i a)
   1138 {
   1139   __builtin_ia32_storelv4si((__v2si *)p, a);
   1140 }
   1141 
/* Stores the two doubles of a to p through the MOVNTPD builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pd(double *p, __m128d a)
{
  __builtin_ia32_movntpd(p, a);
}
   1147 
/* Stores the 128-bit vector a to p through the MOVNTDQ builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si128(__m128i *p, __m128i a)
{
  __builtin_ia32_movntdq(p, a);
}
   1153 
/* Stores the 32-bit integer a to p through the MOVNTI builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a)
{
  __builtin_ia32_movnti(p, a);
}
   1159 
/* Passes the address p to the CLFLUSH builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p)
{
  __builtin_ia32_clflush(p);
}
   1165 
/* Issues the LFENCE builtin; takes no arguments and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __builtin_ia32_lfence();
}
   1171 
/* Issues the MFENCE builtin; takes no arguments and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __builtin_ia32_mfence();
}
   1177 
   1178 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1179 _mm_packs_epi16(__m128i a, __m128i b)
   1180 {
   1181   return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
   1182 }
   1183 
   1184 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1185 _mm_packs_epi32(__m128i a, __m128i b)
   1186 {
   1187   return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
   1188 }
   1189 
   1190 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1191 _mm_packus_epi16(__m128i a, __m128i b)
   1192 {
   1193   return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
   1194 }
   1195 
   1196 static __inline__ int __attribute__((__always_inline__, __nodebug__))
   1197 _mm_extract_epi16(__m128i a, int imm)
   1198 {
   1199   __v8hi b = (__v8hi)a;
   1200   return (unsigned short)b[imm];
   1201 }
   1202 
   1203 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1204 _mm_insert_epi16(__m128i a, int b, int imm)
   1205 {
   1206   __v8hi c = (__v8hi)a;
   1207   c[imm & 7] = b;
   1208   return (__m128i)c;
   1209 }
   1210 
   1211 static __inline__ int __attribute__((__always_inline__, __nodebug__))
   1212 _mm_movemask_epi8(__m128i a)
   1213 {
   1214   return __builtin_ia32_pmovmskb128((__v16qi)a);
   1215 }
   1216 
/* Permutes the four 32-bit lanes of a.  Each 2-bit field of imm (bits 1:0,
 * 3:2, 5:4, 7:6) picks the source lane for result lanes 0..3 respectively.
 * All indices are < 4, so the zero second operand is never selected.
 * The shuffle indices are built from imm, which is expanded four times. */
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_shufflevector((__v4si)(a), (__v4si) {0}, \
                                    (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6))
/* Permutes the four low 16-bit lanes of a.  Each 2-bit field of imm picks
 * the source lane (0..3) for result lanes 0..3; lanes 4..7 are copied
 * through unchanged.  imm is expanded four times. */
#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) {0}, \
                                    (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
                                    4, 5, 6, 7))
/* Permutes the four high 16-bit lanes of a.  Lanes 0..3 are copied through
 * unchanged; each 2-bit field of imm, offset by 4, picks the source lane
 * (4..7) for result lanes 4..7.  imm is expanded four times. */
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) {0}, 0, 1, 2, 3, \
                                    4 + (((imm) & 0x03) >> 0), \
                                    4 + (((imm) & 0x0c) >> 2), \
                                    4 + (((imm) & 0x30) >> 4), \
                                    4 + (((imm) & 0xc0) >> 6)))
   1232 
   1233 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1234 _mm_unpackhi_epi8(__m128i a, __m128i b)
   1235 {
   1236   return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
   1237 }
   1238 
   1239 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1240 _mm_unpackhi_epi16(__m128i a, __m128i b)
   1241 {
   1242   return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
   1243 }
   1244 
   1245 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1246 _mm_unpackhi_epi32(__m128i a, __m128i b)
   1247 {
   1248   return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
   1249 }
   1250 
   1251 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1252 _mm_unpackhi_epi64(__m128i a, __m128i b)
   1253 {
   1254   return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
   1255 }
   1256 
   1257 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1258 _mm_unpacklo_epi8(__m128i a, __m128i b)
   1259 {
   1260   return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
   1261 }
   1262 
   1263 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1264 _mm_unpacklo_epi16(__m128i a, __m128i b)
   1265 {
   1266   return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
   1267 }
   1268 
   1269 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1270 _mm_unpacklo_epi32(__m128i a, __m128i b)
   1271 {
   1272   return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
   1273 }
   1274 
   1275 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1276 _mm_unpacklo_epi64(__m128i a, __m128i b)
   1277 {
   1278   return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
   1279 }
   1280 
   1281 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
   1282 _mm_movepi64_pi64(__m128i a)
   1283 {
   1284   return (__m64)a[0];
   1285 }
   1286 
   1287 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1288 _mm_movpi64_pi64(__m64 a)
   1289 {
   1290   return (__m128i){ (long long)a, 0 };
   1291 }
   1292 
   1293 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1294 _mm_move_epi64(__m128i a)
   1295 {
   1296   return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2);
   1297 }
   1298 
   1299 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
   1300 _mm_unpackhi_pd(__m128d a, __m128d b)
   1301 {
   1302   return __builtin_shufflevector(a, b, 1, 2+1);
   1303 }
   1304 
   1305 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
   1306 _mm_unpacklo_pd(__m128d a, __m128d b)
   1307 {
   1308   return __builtin_shufflevector(a, b, 0, 2+0);
   1309 }
   1310 
   1311 static __inline__ int __attribute__((__always_inline__, __nodebug__))
   1312 _mm_movemask_pd(__m128d a)
   1313 {
   1314   return __builtin_ia32_movmskpd(a);
   1315 }
   1316 
/* Builds a two-double vector: bit 0 of i picks which element of a becomes
 * result element 0, and bit 1 of i picks which element of b becomes result
 * element 1 (the +2 offsets the index into the second operand).
 * i is expanded twice. */
#define _mm_shuffle_pd(a, b, i) \
  (__builtin_shufflevector((__m128d)(a), (__m128d)(b), (i) & 1, \
                                                       (((i) & 2) >> 1) + 2))
   1320 
   1321 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
   1322 _mm_castpd_ps(__m128d in)
   1323 {
   1324   return (__m128)in;
   1325 }
   1326 
   1327 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1328 _mm_castpd_si128(__m128d in)
   1329 {
   1330   return (__m128i)in;
   1331 }
   1332 
   1333 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
   1334 _mm_castps_pd(__m128 in)
   1335 {
   1336   return (__m128d)in;
   1337 }
   1338 
   1339 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
   1340 _mm_castps_si128(__m128 in)
   1341 {
   1342   return (__m128i)in;
   1343 }
   1344 
   1345 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
   1346 _mm_castsi128_ps(__m128i in)
   1347 {
   1348   return (__m128)in;
   1349 }
   1350 
   1351 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
   1352 _mm_castsi128_pd(__m128i in)
   1353 {
   1354   return (__m128d)in;
   1355 }
   1356 
/* Emits a single `pause` instruction through inline assembly.  The asm is
 * marked volatile so the compiler does not drop it even though it has no
 * outputs. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ volatile ("pause");
}
   1362 
/* Builds a 2-bit selector: x goes into bit 1 and y into bit 0.  The result
 * has the bit layout that _mm_shuffle_pd decodes from its i argument. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
   1364 
   1365 #endif /* __SSE2__ */
   1366 
   1367 #endif /* __EMMINTRIN_H */
   1368