      1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
      2  *
      3  * Permission is hereby granted, free of charge, to any person obtaining a copy
      4  * of this software and associated documentation files (the "Software"), to deal
      5  * in the Software without restriction, including without limitation the rights
      6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      7  * copies of the Software, and to permit persons to whom the Software is
      8  * furnished to do so, subject to the following conditions:
      9  *
     10  * The above copyright notice and this permission notice shall be included in
     11  * all copies or substantial portions of the Software.
     12  *
     13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     19  * THE SOFTWARE.
     20  *
     21  *===-----------------------------------------------------------------------===
     22  */
     23 
     24 #ifndef __XMMINTRIN_H
     25 #define __XMMINTRIN_H
     26 
     27 #ifndef __SSE__
     28 #error "SSE instruction set not enabled"
     29 #else
     30 
     31 #include <mmintrin.h>
     32 
     33 typedef int __v4si __attribute__((__vector_size__(16)));
     34 typedef float __v4sf __attribute__((__vector_size__(16)));
     35 typedef float __m128 __attribute__((__vector_size__(16)));
     36 
     37 #include <mm_malloc.h>
     38 
/* Add the low lanes of a and b; lanes 1-3 of the result are copied from a. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 a, __m128 b)
{
  a[0] += b[0];
  return a;
}

/* Lane-wise single-precision add of all four lanes. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 a, __m128 b)
{
  return a + b;
}

/* Subtract the low lane of b from the low lane of a; lanes 1-3 from a. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 a, __m128 b)
{
  a[0] -= b[0];
  return a;
}

/* Lane-wise subtract: a - b. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 a, __m128 b)
{
  return a - b;
}

/* Multiply the low lanes of a and b; lanes 1-3 from a. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 a, __m128 b)
{
  a[0] *= b[0];
  return a;
}

/* Lane-wise multiply. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 a, __m128 b)
{
  return a * b;
}

/* Divide the low lane of a by the low lane of b; lanes 1-3 from a. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 a, __m128 b)
{
  a[0] /= b[0];
  return a;
}

/* Lane-wise divide: a / b. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 a, __m128 b)
{
  return a / b;
}
     90 
/* Square root of the low lane via the sqrtss builtin (upper lanes pass
   through per that instruction's semantics). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 a)
{
  return __builtin_ia32_sqrtss(a);
}

/* Lane-wise square root (sqrtps builtin). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 a)
{
  return __builtin_ia32_sqrtps(a);
}

/* Approximate reciprocal of the low lane (rcpss builtin). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 a)
{
  return __builtin_ia32_rcpss(a);
}

/* Lane-wise approximate reciprocal (rcpps builtin). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 a)
{
  return __builtin_ia32_rcpps(a);
}

/* Approximate reciprocal square root of the low lane (rsqrtss builtin). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 a)
{
  return __builtin_ia32_rsqrtss(a);
}

/* Lane-wise approximate reciprocal square root (rsqrtps builtin). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 a)
{
  return __builtin_ia32_rsqrtps(a);
}
    126 
/* Minimum of the low lanes (minss builtin); upper lanes come from a. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_minss(a, b);
}

/* Lane-wise minimum (minps builtin). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_minps(a, b);
}

/* Maximum of the low lanes (maxss builtin); upper lanes come from a. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_maxss(a, b);
}

/* Lane-wise maximum (maxps builtin). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_maxps(a, b);
}
    150 
/* Bitwise AND of the 128-bit values, done by reinterpreting as int vectors. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a & (__v4si)b);
}

/* Bitwise AND-NOT: (~a) & b — note it is the FIRST operand that is
   complemented, matching the ANDNPS convention. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 a, __m128 b)
{
  return (__m128)(~(__v4si)a & (__v4si)b);
}

/* Bitwise OR. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a | (__v4si)b);
}

/* Bitwise XOR. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a ^ (__v4si)b);
}
    174 
/* Comparisons produce all-ones (true) or all-zeros (false) lane masks.
   The immediate selects the predicate: 0=EQ, 1=LT, 2=LE, 3=UNORD, 4=NEQ,
   5=NLT, 6=NLE, 7=ORD.  For the _ss forms, lanes 1-3 of the result are
   taken from the builtin's first operand. */

/* Low lanes equal; lanes 1-3 from a. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 0);
}

/* Lane-wise equal. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 0);
}

/* Low lane a < b; lanes 1-3 from a. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 1);
}

/* Lane-wise a < b. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 1);
}

/* Low lane a <= b; lanes 1-3 from a. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 2);
}

/* Lane-wise a <= b. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 2);
}
    210 
    211 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
    212 _mm_cmpgt_ss(__m128 a, __m128 b)
    213 {
    214   return (__m128)__builtin_ia32_cmpss(b, a, 1);
    215 }
    216 
/* Lane-wise a > b, implemented as b < a with swapped operands (safe for the
   packed form, where every lane is a compare result). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 1);
}
    222 
    223 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
    224 _mm_cmpge_ss(__m128 a, __m128 b)
    225 {
    226   return (__m128)__builtin_ia32_cmpss(b, a, 2);
    227 }
    228 
/* Lane-wise a >= b via swapped-operand LE compare. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 2);
}

/* Low-lane not-equal (true on NaN); lanes 1-3 from a. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 4);
}

/* Lane-wise not-equal. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 4);
}

/* Low-lane not-less-than (true on NaN); lanes 1-3 from a. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 5);
}

/* Lane-wise not-less-than. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 5);
}

/* Low-lane not-less-or-equal (true on NaN); lanes 1-3 from a. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 6);
}

/* Lane-wise not-less-or-equal. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 6);
}
    270 
    271 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
    272 _mm_cmpngt_ss(__m128 a, __m128 b)
    273 {
    274   return (__m128)__builtin_ia32_cmpss(b, a, 5);
    275 }
    276 
/* Lane-wise !(a > b) via swapped-operand NLT compare (true on NaN). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 5);
}
    282 
    283 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
    284 _mm_cmpnge_ss(__m128 a, __m128 b)
    285 {
    286   return (__m128)__builtin_ia32_cmpss(b, a, 6);
    287 }
    288 
/* Lane-wise !(a >= b) via swapped-operand NLE compare (true on NaN). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 6);
}

/* Low-lane ordered test: true iff neither operand is NaN; lanes 1-3 from a. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 7);
}

/* Lane-wise ordered test. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 7);
}

/* Low-lane unordered test: true iff either operand is NaN; lanes 1-3 from a. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 3);
}

/* Lane-wise unordered test. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 3);
}
    318 
/* Scalar low-lane compares returning an int 0/1.  The comi* builtins map to
   COMISS and the ucomi* builtins to UCOMISS; the latter is the quiet
   (non-signaling on QNaN) variant. */

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comineq(a, b);
}

/* Unordered (quiet) variants below. */

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomineq(a, b);
}
    390 
/* Convert the low float of a to int32 using the current MXCSR rounding mode
   (cvtss2si builtin). */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 a)
{
  return __builtin_ia32_cvtss2si(a);
}

/* Legacy name for _mm_cvtss_si32. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ss2si(__m128 a)
{
  return _mm_cvtss_si32(a);
}

#ifdef __x86_64__

/* Convert the low float of a to int64 using the current rounding mode
   (64-bit cvtss2si).  x86-64 only. */
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 a)
{
  return __builtin_ia32_cvtss2si64(a);
}

#endif

/* Convert the low two floats of a to two int32s in an __m64 (cvtps2pi),
   rounding per MXCSR. */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvtps2pi(a);
}

/* Legacy name for _mm_cvtps_pi32. */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ps2pi(__m128 a)
{
  return _mm_cvtps_pi32(a);
}

/* Truncating conversion of the low float to int32; the C cast truncates
   toward zero by definition, so no builtin is needed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 a)
{
  return a[0];
}

/* Legacy name for _mm_cvttss_si32. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ss2si(__m128 a)
{
  return _mm_cvttss_si32(a);
}

/* Truncating conversion of the low float to int64.
   NOTE(review): unlike _mm_cvtss_si64 above, this is not guarded by
   #ifdef __x86_64__ — confirm availability in 32-bit mode is intended. */
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 a)
{
  return a[0];
}

/* Truncating conversion of the low two floats to two int32s (cvttps2pi). */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvttps2pi(a);
}

/* Legacy name for _mm_cvttps_pi32. */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ps2pi(__m128 a)
{
  return _mm_cvttps_pi32(a);
}
    454 
/* Replace the low lane of a with int b converted to float; lanes 1-3 of a
   are preserved. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 a, int b)
{
  a[0] = b;
  return a;
}

/* Legacy name for _mm_cvtsi32_ss. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_si2ss(__m128 a, int b)
{
  return _mm_cvtsi32_ss(a, b);
}

#ifdef __x86_64__

/* Replace the low lane of a with int64 b converted to float.  x86-64 only. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 a, long long b)
{
  a[0] = b;
  return a;
}

#endif

/* Convert the two int32s in b to floats in the low two lanes; lanes 2-3
   come from a (cvtpi2ps builtin). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 a, __m64 b)
{
  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
}

/* Legacy name for _mm_cvtpi32_ps. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_pi2ps(__m128 a, __m64 b)
{
  return _mm_cvtpi32_ps(a, b);
}

/* Extract the low lane of a as a scalar float. */
static __inline__ float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 a)
{
  return a[0];
}
    496 
/* Load two floats from p into lanes 2-3 of the result; lanes 0-1 from a.
   NOTE(review): reads through (float*) casts of a __m64* — presumably relies
   on the compiler tolerating the aliasing/alignment; b's lanes 2-3 are never
   read (the shuffle selects only b[0], b[1]). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 a, const __m64 *p)
{
  __m128 b;
  b[0] = *(float*)p;
  b[1] = *((float*)p+1);
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}

/* Load two floats from p into lanes 0-1 of the result; lanes 2-3 from a. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 a, const __m64 *p)
{
  __m128 b;
  b[0] = *(float*)p;
  b[1] = *((float*)p+1);
  return __builtin_shufflevector(a, b, 4, 5, 2, 3);
}

/* Load one float into lane 0; lanes 1-3 are zeroed. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ss(const float *p)
{
  return (__m128){ *p, 0, 0, 0 };
}

/* Broadcast the float at p to all four lanes. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load1_ps(const float *p)
{
  return (__m128){ *p, *p, *p, *p };
}

/* Microsoft-compatible alias for _mm_load1_ps. */
#define        _mm_load_ps1(p) _mm_load1_ps(p)

/* Aligned 16-byte load of four floats. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(const float *p)
{
  return *(__m128*)p;
}

/* Unaligned load of four floats (loadups builtin). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(const float *p)
{
  return __builtin_ia32_loadups(p);
}

/* Aligned load with the four floats reversed (lane 0 <- p[3], etc.). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(const float *p)
{
  __m128 a = _mm_load_ps(p);
  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
}
    547 
/* Build a vector with w in lane 0 and zeros in lanes 1-3. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float w)
{
  return (__m128){ w, 0, 0, 0 };
}

/* Broadcast w to all four lanes. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set1_ps(float w)
{
  return (__m128){ w, w, w, w };
}

// Microsoft specific.
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps1(float w)
{
    return _mm_set1_ps(w);
}

/* Arguments are given high-lane first: result lanes are {w, x, y, z}. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps(float z, float y, float x, float w)
{
  return (__m128){ w, x, y, z };
}

/* Reversed-order set: result lanes are {z, y, x, w} in argument order. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setr_ps(float z, float y, float x, float w)
{
  return (__m128){ z, y, x, w };
}
    578 
    579 static __inline__ __m128 __attribute__((__always_inline__))
    580 _mm_setzero_ps(void)
    581 {
    582   return (__m128){ 0, 0, 0, 0 };
    583 }
    584 
    585 static __inline__ void __attribute__((__always_inline__))
    586 _mm_storeh_pi(__m64 *p, __m128 a)
    587 {
    588   __builtin_ia32_storehps((__v2si *)p, a);
    589 }
    590 
    591 static __inline__ void __attribute__((__always_inline__))
    592 _mm_storel_pi(__m64 *p, __m128 a)
    593 {
    594   __builtin_ia32_storelps((__v2si *)p, a);
    595 }
    596 
    597 static __inline__ void __attribute__((__always_inline__))
    598 _mm_store_ss(float *p, __m128 a)
    599 {
    600   *p = a[0];
    601 }
    602 
/* Unaligned store of four floats (storeups builtin). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_ps(float *p, __m128 a)
{
  __builtin_ia32_storeups(p, a);
}

/* Store the low lane of a to all four floats at p.
   NOTE(review): uses the unaligned store, unlike _mm_storer_ps below which
   uses the aligned one — confirm the weaker alignment requirement is
   intentional. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
  _mm_storeu_ps(p, a);
}
    615 
    616 static __inline__ void __attribute__((__always_inline__, __nodebug__))
    617 _mm_store_ps1(float *p, __m128 a)
    618 {
    619     return _mm_store1_ps(p, a);
    620 }
    621 
/* Aligned 16-byte store of four floats. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps(float *p, __m128 a)
{
  *(__m128 *)p = a;
}

/* Aligned store with the four floats reversed (p[0] <- lane 3, etc.). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
  _mm_store_ps(p, a);
}
    634 
    635 #define _MM_HINT_T0 3
    636 #define _MM_HINT_T1 2
    637 #define _MM_HINT_T2 1
    638 #define _MM_HINT_NTA 0
    639 
    640 /* FIXME: We have to #define this because "sel" must be a constant integer, and
    641    Sema doesn't do any form of constant propagation yet. */
    642 
    643 #define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, sel))
    644 
/* Non-temporal store of a to *p (movntq builtin — bypasses the cache). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pi(__m64 *p, __m64 a)
{
  __builtin_ia32_movntq(p, a);
}

/* Non-temporal aligned store of four floats (movntps builtin). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_ps(float *p, __m128 a)
{
  __builtin_ia32_movntps(p, a);
}

/* Store fence: order all prior stores before subsequent ones. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  __builtin_ia32_sfence();
}

/* Extract 16-bit lane (n & 3) of a, zero-extended to int. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_extract_pi16(__m64 a, int n)
{
  __v4hi b = (__v4hi)a;
  return (unsigned short)b[n & 3];
}

/* Return a with 16-bit lane (n & 3) replaced by the low 16 bits of d. */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_insert_pi16(__m64 a, int d, int n)
{
   __v4hi b = (__v4hi)a;
   b[n & 3] = d;
   return (__m64)b;
}
    677 
/* Lane-wise signed 16-bit maximum (pmaxsw builtin). */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
}

/* Lane-wise unsigned 8-bit maximum (pmaxub builtin). */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
}

/* Lane-wise signed 16-bit minimum (pminsw builtin). */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
}

/* Lane-wise unsigned 8-bit minimum (pminub builtin). */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
}

/* Gather the sign bits of the eight bytes of a into the low 8 bits of the
   result (pmovmskb builtin). */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pi8(__m64 a)
{
  return __builtin_ia32_pmovmskb((__v8qi)a);
}

/* Lane-wise unsigned 16-bit multiply returning the high 16 bits of each
   product (pmulhuw builtin). */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);
}
    713 
    714 #define _mm_shuffle_pi16(a, n) \
    715   ((__m64)__builtin_ia32_pshufw(a, n))
    716 
/* Byte-masked store of d to p: bytes of d whose corresponding byte in n has
   its high bit set are written (maskmovq builtin). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmove_si64(__m64 d, __m64 n, char *p)
{
  __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
}

/* Lane-wise unsigned 8-bit rounded average (pavgb builtin). */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
}

/* Lane-wise unsigned 16-bit rounded average (pavgw builtin). */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
}

/* Sum of absolute byte differences of a and b (psadbw builtin). */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sad_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
}

/* Read the MXCSR control/status register (stmxcsr builtin). */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  return __builtin_ia32_stmxcsr();
}

/* Write the MXCSR control/status register (ldmxcsr builtin). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int i)
{
  __builtin_ia32_ldmxcsr(i);
}
    752 
/* Select result lanes 0-1 from a and lanes 2-3 from b, each chosen by a
   2-bit field of mask (low bits first); mask must be a compile-time
   constant, hence the macro. */
#define _mm_shuffle_ps(a, b, mask) \
        (__builtin_shufflevector((__v4sf)(a), (__v4sf)(b),                \
                                 (mask) & 0x3, ((mask) & 0xc) >> 2, \
                                 (((mask) & 0x30) >> 4) + 4, \
                                 (((mask) & 0xc0) >> 6) + 4))

/* Interleave the high halves: {a2, b2, a3, b3}. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
}

/* Interleave the low halves: {a0, b0, a1, b1}. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
}

/* Result is a with lane 0 replaced by b's lane 0: {b0, a1, a2, a3}. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 4, 1, 2, 3);
}

/* Move b's high pair into the low pair: {b2, b3, a2, a3}. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 6, 7, 2, 3);
}

/* Move b's low pair into the high pair: {a0, a1, b0, b1}. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}
    788 
/* Convert four signed 16-bit ints in a to four floats.  The compare against
   zero builds a sign mask so the unpacks produce sign-extended 32-bit
   values; each pair is then converted with _mm_cvtpi32_ps and the high pair
   is moved into lanes 2-3 via _mm_movelh_ps before converting the low pair. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi16(b, a);   /* all-ones in lanes where a is negative */
  c = _mm_unpackhi_pi16(a, b); /* sign-extend high two lanes to int32 */
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);     /* park converted highs in lanes 2-3 */
  c = _mm_unpacklo_pi16(a, b); /* sign-extend low two lanes to int32 */
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

/* Convert four unsigned 16-bit ints in a to four floats; same structure as
   _mm_cvtpi16_ps but zero-extends (high words are zero). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  c = _mm_unpackhi_pi16(a, b); /* zero-extend high two lanes */
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b); /* zero-extend low two lanes */
  r = _mm_cvtpi32_ps(r, c);

  return r;
}
    823 
/* Convert the low four signed 8-bit ints in a to four floats: sign-extend
   bytes to 16-bit words (via a compare-generated sign mask), then reuse
   _mm_cvtpi16_ps. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi8(b, a);   /* all-ones in lanes where a is negative */
  b = _mm_unpacklo_pi8(a, b); /* sign-extend low four bytes to words */

  return _mm_cvtpi16_ps(b);
}

/* Convert the low four unsigned 8-bit ints in a to four floats
   (zero-extend, then reuse _mm_cvtpi16_ps). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_unpacklo_pi8(a, b); /* zero-extend low four bytes to words */

  return _mm_cvtpi16_ps(b);
}

/* Convert two pairs of int32s to four floats: result lanes are
   {a0, a1, b0, b1}. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
  __m128 c;

  c = _mm_setzero_ps();
  c = _mm_cvtpi32_ps(c, b);  /* b in lanes 0-1 */
  c = _mm_movelh_ps(c, c);   /* duplicate b into lanes 2-3 */

  return _mm_cvtpi32_ps(c, a); /* a overwrites lanes 0-1 */
}
    858 
    859 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
    860 _mm_cvtps_pi16(__m128 a)
    861 {
    862   __m64 b, c;
    863 
    864   b = _mm_cvtps_pi32(a);
    865   a = _mm_movehl_ps(a, a);
    866   c = _mm_cvtps_pi32(a);
    867 
    868   return _mm_packs_pi16(b, c);
    869 }
    870 
/* Convert the four floats of a to four signed 8-bit ints with saturation in
   the low four bytes of the result; the high four bytes are packed from
   zero. */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi8(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi16(a);     /* four floats -> four int16 */
  c = _mm_setzero_si64();

  return _mm_packs_pi16(b, c); /* saturating int16 -> int8 pack */
}

/* Gather the sign bits of the four lanes of a into the low 4 bits of the
   result (movmskps builtin). */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 a)
{
  return __builtin_ia32_movmskps(a);
}
    887 
/* Build an 8-bit shuffle immediate from four 2-bit lane selectors; z ends
   up in the highest two bits. */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

/* MXCSR exception-status flag bits (sticky). */
#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

/* MXCSR exception-mask bits (1 = exception masked). */
#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

/* MXCSR rounding-control field values. */
#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)
    911 
    912 #define _MM_FLUSH_ZERO_MASK   (0x8000)
    913 #define _MM_FLUSH_ZERO_ON     (0x8000)
    914 #define _MM_FLUSH_ZERO_OFF    (0x8000)
    915 
/* Read individual MXCSR fields via _mm_getcsr(). */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

/* Read-modify-write one MXCSR field, leaving the other fields intact. */
#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
    925 
/* Transpose the 4x4 float matrix held in row0..row3 in place, using the
   classic unpack/movelh/movehl sequence.  Each rowN argument is both read
   and written. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
    938 
    939 /* Aliases for compatibility. */
    940 #define _m_pextrw _mm_extract_pi16
    941 #define _m_pinsrw _mm_insert_pi16
    942 #define _m_pmaxsw _mm_max_pi16
    943 #define _m_pmaxub _mm_max_pu8
    944 #define _m_pminsw _mm_min_pi16
    945 #define _m_pminub _mm_min_pu8
    946 #define _m_pmovmskb _mm_movemask_pi8
    947 #define _m_pmulhuw _mm_mulhi_pu16
    948 #define _m_pshufw _mm_shuffle_pi16
    949 #define _m_maskmovq _mm_maskmove_si64
    950 #define _m_pavgb _mm_avg_pu8
    951 #define _m_pavgw _mm_avg_pu16
    952 #define _m_psadbw _mm_sad_pu8
    953 #define _m_ _mm_
    954 #define _m_ _mm_
    955 
    956 /* Ugly hack for backwards-compatibility (compatible with gcc) */
    957 #ifdef __SSE2__
    958 #include <emmintrin.h>
    959 #endif
    960 
    961 #endif /* __SSE__ */
    962 
    963 #endif /* __XMMINTRIN_H */
    964