/*
 * Loongson MMI optimizations for libjpeg-turbo
 *
 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
 *                          All Rights Reserved.
 * Copyright (C) 2019, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#ifndef __LOONGSON_MMINTRIN_H__
#define __LOONGSON_MMINTRIN_H__

#include <stdint.h>


#define FUNCTION_ATTRIBS \
  __attribute__((__gnu_inline__, __always_inline__, __artificial__))


/* Vectors are stored in 64-bit floating-point registers. */
typedef double __m64;

/* Having a 32-bit datatype allows us to use 32-bit loads in places like
   load8888. */
typedef float __m32;
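
/* Illustrative usage sketch (hypothetical values, not part of this header's
   API): the __m32 type lets a packed 32-bit quantity be loaded with a single
   lwc1 instruction and then widened to four 16-bit lanes, e.g.

     uint32_t rgba = 0x80FF40C0;
     __m64 wide = _mm_loadlo_pi8(&rgba);

   which produces the lanes 0x0080, 0x00FF, 0x0040, 0x00C0 (most to least
   significant), using the _mm_loadlo_pi8() intrinsic defined later in this
   file. */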


/********** Set Operations **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_setzero_si64(void)
{
  return 0.0;
}

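/* The lo/hi interleave below places __b0 in the least-significant byte of the
   result and __b7 in the most-significant byte, matching the x86
   _mm_set_pi8() argument convention. */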
extern __inline __m64 FUNCTION_ATTRIBS
_mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4,
            uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0)
{
  __m64 ret;
  uint32_t lo = ((uint32_t)__b6 << 24) |
                ((uint32_t)__b4 << 16) |
                ((uint32_t)__b2 << 8) |
                (uint32_t)__b0;
  uint32_t hi = ((uint32_t)__b7 << 24) |
                ((uint32_t)__b5 << 16) |
                ((uint32_t)__b3 << 8) |
                (uint32_t)__b1;

  asm("mtc1      %1, %0\n\t"
      "mtc1      %2, $f0\n\t"
      "punpcklbh %0, %0, $f0\n\t"
      : "=f" (ret)
      : "r" (lo), "r" (hi)
      : "$f0"
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0)
{
  __m64 ret;
  uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0;
  uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1;

  asm("mtc1      %1, %0\n\t"
      "mtc1      %2, $f0\n\t"
      "punpcklhw %0, %0, $f0\n\t"
      : "=f" (ret)
      : "r" (lo), "r" (hi)
      : "$f0"
     );

  return ret;
}

#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
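
/* For example, _MM_SHUFFLE(3, 3, 3, 3) evaluates to 0xFF; when that value is
   passed to _mm_shuffle_pi16() (pshufh), each 2-bit field selects source
   halfword 3, so the top 16-bit lane is replicated across all four lanes.
   _mm_expand_alpha() below relies on exactly this encoding. */

/* _mm_set_pi32() below picks one of three paths: when both arguments are
   compile-time constants, the 64-bit value can be folded by the compiler;
   when the two words are equal, a single pshufh broadcast is used; otherwise
   the words are combined through a 64-bit temporary. */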

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set_pi32(uint32_t __i1, uint32_t __i0)
{
  if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) {
    uint64_t val = ((uint64_t)__i1 << 32) |
                   ((uint64_t)__i0 <<  0);

    return *(__m64 *)&val;
  } else if (__i1 == __i0) {
    uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0);
    __m64 ret;

    asm("pshufh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
       );

    return ret;
  } else {
    uint64_t val = ((uint64_t)__i1 << 32) |
                   ((uint64_t)__i0 <<  0);

    return *(__m64 *)&val;
  }
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set1_pi8(uint8_t __b0)
{
  __m64 ret;

  asm("sll    $8, %1, 8\n\t"
      "or     %1, %1, $8\n\t"
      "mtc1   %1, %0\n\t"
      "mtc1   $0, $f0\n\t"
      "pshufh %0, %0, $f0\n\t"
      : "=f" (ret), "+r" (__b0)
      :
      : "$8", "$f0"
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set1_pi16(uint16_t __h0)
{
  __m64 ret;

  asm("mtc1   %1, %0\n\t"
      "mtc1   $0, $f0\n\t"
      "pshufh %0, %0, $f0\n\t"
      : "=f" (ret)
      : "r" (__h0)
      : "$f0"
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set1_pi32(uint32_t __i0)
{
  return _mm_set_pi32(__i0, __i0);
}
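
/* Illustrative usage sketch (hypothetical values, not taken from the MMI
   kernels):

     __m64 ones = _mm_set1_pi16(1);
     __m64 mask = _mm_set1_pi8(0xFF);

   broadcast 0x0001 into each 16-bit lane and 0xFF into each byte lane,
   respectively. */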

extern __inline __m64 FUNCTION_ATTRIBS
_mm_setr_pi8(uint8_t __b0, uint8_t __b1, uint8_t __b2, uint8_t __b3,
             uint8_t __b4, uint8_t __b5, uint8_t __b6, uint8_t __b7)
{
  return _mm_set_pi8(__b7, __b6, __b5, __b4,
                     __b3, __b2, __b1, __b0);
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_setr_pi16(uint16_t __h0, uint16_t __h1, uint16_t __h2, uint16_t __h3)
{
  return _mm_set_pi16(__h3, __h2, __h1, __h0);
}
    176 
    177 extern __inline __m64 FUNCTION_ATTRIBS
    178 _mm_setr_pi32(uint32_t __i0, uint32_t __i1)
    179 {
    180   return _mm_set_pi32(__i1, __i0);
    181 }
    182 
    183 
    184 /********** Arithmetic Operations **********/
    185 
    186 extern __inline __m64 FUNCTION_ATTRIBS
    187 _mm_add_pi8(__m64 __m1, __m64 __m2)
    188 {
    189   __m64 ret;
    190 
    191   asm("paddb %0, %1, %2\n\t"
    192       : "=f" (ret)
    193       : "f" (__m1), "f" (__m2)
    194      );
    195 
    196   return ret;
    197 }
    198 
    199 extern __inline __m64 FUNCTION_ATTRIBS
    200 _mm_add_pi16(__m64 __m1, __m64 __m2)
    201 {
    202   __m64 ret;
    203 
    204   asm("paddh %0, %1, %2\n\t"
    205       : "=f" (ret)
    206       : "f" (__m1), "f" (__m2)
    207      );
    208 
    209   return ret;
    210 }
    211 
    212 extern __inline __m64 FUNCTION_ATTRIBS
    213 _mm_add_pi32(__m64 __m1, __m64 __m2)
    214 {
    215   __m64 ret;
    216 
    217   asm("paddw %0, %1, %2\n\t"
    218       : "=f" (ret)
    219       : "f" (__m1), "f" (__m2)
    220      );
    221 
    222   return ret;
    223 }
    224 
    225 extern __inline __m64 FUNCTION_ATTRIBS
    226 _mm_add_si64(__m64 __m1, __m64 __m2)
    227 {
    228   __m64 ret;
    229 
    230   asm("paddd %0, %1, %2\n\t"
    231       : "=f" (ret)
    232       : "f" (__m1), "f" (__m2)
    233      );
    234 
    235   return ret;
    236 }
    237 
    238 extern __inline __m64 FUNCTION_ATTRIBS
    239 _mm_adds_pi8(__m64 __m1, __m64 __m2)
    240 {
    241   __m64 ret;
    242 
    243   asm("paddsb %0, %1, %2\n\t"
    244       : "=f" (ret)
    245       : "f" (__m1), "f" (__m2)
    246      );
    247 
    248   return ret;
    249 }
    250 
    251 extern __inline __m64 FUNCTION_ATTRIBS
    252 _mm_adds_pi16(__m64 __m1, __m64 __m2)
    253 {
    254   __m64 ret;
    255 
    256   asm("paddsh %0, %1, %2\n\t"
    257       : "=f" (ret)
    258       : "f" (__m1), "f" (__m2)
    259      );
    260 
    261   return ret;
    262 }
    263 
    264 
    265 extern __inline __m64 FUNCTION_ATTRIBS
    266 _mm_adds_pu8(__m64 __m1, __m64 __m2)
    267 {
    268   __m64 ret;
    269 
    270   asm("paddusb %0, %1, %2\n\t"
    271       : "=f" (ret)
    272       : "f" (__m1), "f" (__m2)
    273      );
    274 
    275   return ret;
    276 }
    277 
    278 extern __inline __m64 FUNCTION_ATTRIBS
    279 _mm_adds_pu16(__m64 __m1, __m64 __m2)
    280 {
    281   __m64 ret;
    282 
    283   asm("paddush %0, %1, %2\n\t"
    284       : "=f" (ret)
    285       : "f" (__m1), "f" (__m2)
    286      );
    287 
    288   return ret;
    289 }
    290 
    291 extern __inline __m64 FUNCTION_ATTRIBS
    292 _mm_avg_pu8(__m64 __m1, __m64 __m2)
    293 {
    294   __m64 ret;
    295 
    296   asm("pavgb %0, %1, %2\n\t"
    297       : "=f" (ret)
    298       : "f" (__m1), "f" (__m2)
    299      );
    300 
    301   return ret;
    302 }
    303 
    304 extern __inline __m64 FUNCTION_ATTRIBS
    305 _mm_avg_pu16(__m64 __m1, __m64 __m2)
    306 {
    307   __m64 ret;
    308 
    309   asm("pavgh %0, %1, %2\n\t"
    310       : "=f" (ret)
    311       : "f" (__m1), "f" (__m2)
    312      );
    313 
    314   return ret;
    315 }
    316 
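/* pmaddhw multiplies corresponding signed 16-bit lanes and adds each pair of
   adjacent 32-bit products, yielding two 32-bit results (the same behavior as
   x86 pmaddwd/_mm_madd_pi16()). */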
extern __inline __m64 FUNCTION_ATTRIBS
_mm_madd_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmaddhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_max_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmaxsh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_max_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmaxub %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_min_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pminsh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_min_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pminub %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline int FUNCTION_ATTRIBS
_mm_movemask_pi8(__m64 __m1)
{
  int ret;

  asm("pmovmskb %0, %1\n\t"
      : "=r" (ret)
      : "y" (__m1)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmulhh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_mulhi_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmulhuh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_mullo_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmullh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_mul_pu32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmuluw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_sad_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psadbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_asub_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pasubub %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_biadd_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("biadd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubsb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubsh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubusb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubush %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}


/********** Logical Operations **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_and_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("and %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_andnot_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("andn %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m32 FUNCTION_ATTRIBS
_mm_or_si32(__m32 __m1, __m32 __m2)
{
  __m32 ret;

  asm("or %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_or_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("or %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_xor_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("xor %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}


/********** Shift Operations **********/

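/* Unlike their x86 counterparts (_mm_slli_pi16() etc.), which take an
   immediate count, these wrappers pass the shift count to the hardware in a
   floating-point register, so the count may be a run-time value. */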
extern __inline __m64 FUNCTION_ATTRIBS
_mm_slli_pi16(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psllh  %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_slli_pi32(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psllw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_slli_si64(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("dsll  %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srli_pi16(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psrlh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srli_pi32(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psrlw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srli_si64(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("dsrl  %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srai_pi16(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psrah %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srai_pi32(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psraw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srai_si64(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("dsra %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}


/********** Conversion Intrinsics **********/

extern __inline __m64 FUNCTION_ATTRIBS
to_m64(uint64_t x)
{
  return *(__m64 *)&x;
}

extern __inline uint64_t FUNCTION_ATTRIBS
to_uint64(__m64 x)
{
  return *(uint64_t *)&x;
}
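
/* to_m64() and to_uint64() reinterpret the 64-bit pattern rather than
   converting it numerically; e.g. to_uint64(to_m64(0x0001000200030004ULL))
   yields 0x0001000200030004ULL again, whereas a value cast between double and
   uint64_t would be converted arithmetically. */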


/********** Comparison Intrinsics **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpeqb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpeqh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpeqw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpgtb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpgth %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpgtw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmplt_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpltb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmplt_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmplth %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmplt_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpltw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}


/********** Miscellaneous Operations **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packsshb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packsswh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pi32_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packsswh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packushb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_extract_pi16(__m64 __m, int64_t __pos)
{
  __m64 ret;

  asm("pextrh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__pos)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos)
{
  __m64 ret;

  switch (__pos) {
  case 0:

    asm("pinsrh_0 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );

    break;

  case 1:

    asm("pinsrh_1 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );

    break;

  case 2:

    asm("pinsrh_2 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );

    break;

  case 3:

    asm("pinsrh_3 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );

    break;
  }

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_shuffle_pi16(__m64 __m, int64_t __n)
{
  __m64 ret;

  asm("pshufh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__n)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhwd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* For callers that care about the high 32 bits of the source, this variant
   uses the __m64 datatype, which preserves the data. */

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Since punpcklbh doesn't care about the high 32 bits, we use the __m32
   datatype, which allows load8888 to use 32-bit loads. */

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklwd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklwd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

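/* Note that this is not a plain 32-bit store: the 16-bit lanes of src are
   first saturated to unsigned 8-bit values with _mm_packs_pu16(), and the
   low 32 bits of the packed result are then stored (a store8888-style
   operation). */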
extern __inline void FUNCTION_ATTRIBS
_mm_store_pi32(__m32 *dest, __m64 src)
{
  src = _mm_packs_pu16(src, _mm_setzero_si64());

  asm("swc1 %1, %0\n\t"
      : "=m" (*dest)
      : "f" (src)
      : "memory"
     );
}

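/* gssdlc1/gssdrc1 are the Loongson unaligned store pair (analogous to
   sdl/sdr), so dest need not be 8-byte aligned. */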
extern __inline void FUNCTION_ATTRIBS
_mm_store_si64(__m64 *dest, __m64 src)
{
  asm("gssdlc1 %1, 7+%0\n\t"
      "gssdrc1 %1, %0\n\t"
      : "=m" (*dest)
      : "f" (src)
      : "memory"
     );
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_load_si32(const __m32 *src)
{
  /* Use __m64 here so that returning the value does not involve a
     float-to-double conversion, which would alter the loaded bit pattern. */
  __m64 ret;

  asm("lwc1 %0, %1\n\t"
      : "=f" (ret)
      : "m" (*src)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_load_si64(const __m64 *src)
{
  __m64 ret;

  asm("ldc1 %0, %1\n\t"
      : "=f" (ret)
      : "m" (*src)
      : "memory"
     );

  return ret;
}

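/* gsldlc1/gsldrc1 are the Loongson unaligned load pair (analogous to
   ldl/ldr), so src need not be 8-byte aligned. */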
extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadu_si64(const __m64 *src)
{
  __m64 ret;

  asm("gsldlc1 %0,  7(%1)\n\t"
      "gsldrc1 %0,  0(%1)\n\t"
      : "=f" (ret)
      : "r" (src)
      : "memory"
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi8(const uint32_t *src)
{
  return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi8_f(__m64 src)
{
  return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadhi_pi8_f(__m64 src)
{
  return _mm_unpackhi_pi8_f(src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi16(__m64 src)
{
  return _mm_unpacklo_pi16(src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi16_f(__m64 src)
{
  return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src);
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadhi_pi16(__m64 src)
{
  return _mm_unpackhi_pi16(src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadhi_pi16_f(__m64 src)
{
  return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src);
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_expand_alpha(__m64 pixel)
{
  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3));
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_expand_alpha_rev(__m64 pixel)
{
  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0));
}
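
/* _mm_expand_alpha() replicates the most-significant 16-bit lane into all
   four lanes, and _mm_expand_alpha_rev() replicates the least-significant
   lane.  (The "alpha" naming follows pixman-style usage, where the pixel has
   been widened to 16 bits per channel; that usage is an assumption here, not
   something this header requires.) */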

#endif  /* __LOONGSON_MMINTRIN_H__ */