1 /* 2 * Loongson MMI optimizations for libjpeg-turbo 3 * 4 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. 5 * All Rights Reserved. 6 * Copyright (C) 2019, D. R. Commander. All Rights Reserved. 7 * 8 * This software is provided 'as-is', without any express or implied 9 * warranty. In no event will the authors be held liable for any damages 10 * arising from the use of this software. 11 * 12 * Permission is granted to anyone to use this software for any purpose, 13 * including commercial applications, and to alter it and redistribute it 14 * freely, subject to the following restrictions: 15 * 16 * 1. The origin of this software must not be misrepresented; you must not 17 * claim that you wrote the original software. If you use this software 18 * in a product, an acknowledgment in the product documentation would be 19 * appreciated but is not required. 20 * 2. Altered source versions must be plainly marked as such, and must not be 21 * misrepresented as being the original software. 22 * 3. This notice may not be removed or altered from any source distribution. 23 */ 24 25 #ifndef __LOONGSON_MMINTRIN_H__ 26 #define __LOONGSON_MMINTRIN_H__ 27 28 #include <stdint.h> 29 30 31 #define FUNCTION_ATTRIBS \ 32 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 33 34 35 /* Vectors are stored in 64-bit floating-point registers. */ 36 typedef double __m64; 37 38 /* Having a 32-bit datatype allows us to use 32-bit loads in places like 39 load8888. */ 40 typedef float __m32; 41 42 43 /********** Set Operations **********/ 44 45 extern __inline __m64 FUNCTION_ATTRIBS 46 _mm_setzero_si64(void) 47 { 48 return 0.0; 49 } 50 51 extern __inline __m64 FUNCTION_ATTRIBS 52 _mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4, 53 uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0) 54 { 55 __m64 ret; 56 uint32_t lo = ((uint32_t)__b6 << 24) | 57 ((uint32_t)__b4 << 16) | 58 ((uint32_t)__b2 << 8) | 59 (uint32_t)__b0; 60 uint32_t hi = ((uint32_t)__b7 << 24) | 61 ((uint32_t)__b5 << 16) | 62 ((uint32_t)__b3 << 8) | 63 (uint32_t)__b1; 64 65 asm("mtc1 %1, %0\n\t" 66 "mtc1 %2, $f0\n\t" 67 "punpcklbh %0, %0, $f0\n\t" 68 : "=f" (ret) 69 : "r" (lo), "r" (hi) 70 : "$f0" 71 ); 72 73 return ret; 74 } 75 76 extern __inline __m64 FUNCTION_ATTRIBS 77 _mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0) 78 { 79 __m64 ret; 80 uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0; 81 uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1; 82 83 asm("mtc1 %1, %0\n\t" 84 "mtc1 %2, $f0\n\t" 85 "punpcklhw %0, %0, $f0\n\t" 86 : "=f" (ret) 87 : "r" (lo), "r" (hi) 88 : "$f0" 89 ); 90 91 return ret; 92 } 93 94 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ 95 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) 96 97 extern __inline __m64 FUNCTION_ATTRIBS 98 _mm_set_pi32(uint32_t __i1, uint32_t __i0) 99 { 100 if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) { 101 uint64_t val = ((uint64_t)__i1 << 32) | 102 ((uint64_t)__i0 << 0); 103 104 return *(__m64 *)&val; 105 } else if (__i1 == __i0) { 106 uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0); 107 __m64 ret; 108 109 asm("pshufh %0, %1, %2\n\t" 110 : "=f" (ret) 111 : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm) 112 ); 113 114 return ret; 115 } else { 116 uint64_t val = ((uint64_t)__i1 << 32) | 117 ((uint64_t)__i0 << 0); 118 119 return *(__m64 *)&val; 120 } 121 } 122 123 extern __inline __m64 FUNCTION_ATTRIBS 124 _mm_set1_pi8(uint8_t __b0) 125 { 126 __m64 ret; 127 128 asm("sll $8, %1, 8\n\t" 129 "or %1, %1, $8\n\t" 130 "mtc1 %1, %0\n\t" 131 "mtc1 $0, $f0\n\t" 132 "pshufh %0, %0, $f0\n\t" 133 : "=f" (ret) 134 : "r" (__b0) 135 : "$8", "$f0" 136 ); 137 138 return ret; 139 } 140 141 extern __inline __m64 FUNCTION_ATTRIBS 142 _mm_set1_pi16(uint16_t __h0) 143 { 144 __m64 ret; 145 146 asm("mtc1 %1, %0\n\t" 147 "mtc1 $0, $f0\n\t" 148 "pshufh %0, %0, $f0\n\t" 149 : "=f" (ret) 150 : "r" (__h0) 151 : "$8", "$f0" 152 ); 153 154 return ret; 155 } 156 157 extern __inline __m64 FUNCTION_ATTRIBS 158 _mm_set1_pi32(unsigned __i0) 159 { 160 return _mm_set_pi32(__i0, __i0); 161 } 162 163 extern __inline __m64 FUNCTION_ATTRIBS 164 _mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3, 165 uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7) 166 { 167 return _mm_set_pi8(__h7, __h6, __h5, __h4, 168 __h3, __h2, __h1, __h0); 169 } 170 171 extern __inline __m64 FUNCTION_ATTRIBS 172 _mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3) 173 { 174 return _mm_set_pi16(__w3, __w2, __w1, __w0); 175 } 176 177 extern __inline __m64 FUNCTION_ATTRIBS 178 _mm_setr_pi32(uint32_t __i0, uint32_t __i1) 179 { 180 return _mm_set_pi32(__i1, __i0); 181 } 182 183 184 /********** Arithmetic Operations **********/ 185 186 extern __inline __m64 FUNCTION_ATTRIBS 187 _mm_add_pi8(__m64 __m1, __m64 __m2) 188 { 189 __m64 ret; 190 191 asm("paddb %0, %1, %2\n\t" 192 : "=f" (ret) 193 : "f" (__m1), "f" (__m2) 194 ); 195 196 return ret; 197 } 198 199 extern __inline __m64 FUNCTION_ATTRIBS 200 _mm_add_pi16(__m64 __m1, __m64 __m2) 201 { 202 __m64 ret; 203 204 asm("paddh %0, %1, %2\n\t" 205 : "=f" (ret) 206 : "f" (__m1), "f" (__m2) 207 ); 208 209 return ret; 210 } 211 212 extern __inline __m64 FUNCTION_ATTRIBS 213 _mm_add_pi32(__m64 __m1, __m64 __m2) 214 { 215 __m64 ret; 216 217 asm("paddw %0, %1, %2\n\t" 218 : "=f" (ret) 219 : "f" (__m1), "f" (__m2) 220 ); 221 222 return ret; 223 } 224 225 extern __inline __m64 FUNCTION_ATTRIBS 226 _mm_add_si64(__m64 __m1, __m64 __m2) 227 { 228 __m64 ret; 229 230 asm("paddd %0, %1, %2\n\t" 231 : "=f" (ret) 232 : "f" (__m1), "f" (__m2) 233 ); 234 235 return ret; 236 } 237 238 extern __inline __m64 FUNCTION_ATTRIBS 239 _mm_adds_pi8(__m64 __m1, __m64 __m2) 240 { 241 __m64 ret; 242 243 asm("paddsb %0, %1, %2\n\t" 244 : "=f" (ret) 245 : "f" (__m1), "f" (__m2) 246 ); 247 248 return ret; 249 } 250 251 extern __inline __m64 FUNCTION_ATTRIBS 252 _mm_adds_pi16(__m64 __m1, __m64 __m2) 253 { 254 __m64 ret; 255 256 asm("paddsh %0, %1, %2\n\t" 257 : "=f" (ret) 258 : "f" (__m1), "f" (__m2) 259 ); 260 261 return ret; 262 } 263 264 265 extern __inline __m64 FUNCTION_ATTRIBS 266 _mm_adds_pu8(__m64 __m1, __m64 __m2) 267 { 268 __m64 ret; 269 270 asm("paddusb %0, %1, %2\n\t" 271 : "=f" (ret) 272 : "f" (__m1), "f" (__m2) 273 ); 274 275 return ret; 276 } 277 278 extern __inline __m64 FUNCTION_ATTRIBS 279 _mm_adds_pu16(__m64 __m1, __m64 __m2) 280 { 281 __m64 ret; 282 283 asm("paddush %0, %1, %2\n\t" 284 : "=f" (ret) 285 : "f" (__m1), "f" (__m2) 286 ); 287 288 return ret; 289 } 290 291 extern __inline __m64 FUNCTION_ATTRIBS 292 _mm_avg_pu8(__m64 __m1, __m64 __m2) 293 { 294 __m64 ret; 295 296 asm("pavgb %0, %1, %2\n\t" 297 : "=f" (ret) 298 : "f" (__m1), "f" (__m2) 299 ); 300 301 return ret; 302 } 303 304 extern __inline __m64 FUNCTION_ATTRIBS 305 _mm_avg_pu16(__m64 __m1, __m64 __m2) 306 { 307 __m64 ret; 308 309 asm("pavgh %0, %1, %2\n\t" 310 : "=f" (ret) 311 : "f" (__m1), "f" (__m2) 312 ); 313 314 return ret; 315 } 316 317 extern __inline __m64 FUNCTION_ATTRIBS 318 _mm_madd_pi16(__m64 __m1, __m64 __m2) 319 { 320 __m64 ret; 321 322 asm("pmaddhw %0, %1, %2\n\t" 323 : "=f" (ret) 324 : "f" (__m1), "f" (__m2) 325 ); 326 327 return ret; 328 } 329 330 extern __inline __m64 FUNCTION_ATTRIBS 331 _mm_max_pi16(__m64 __m1, __m64 __m2) 332 { 333 __m64 ret; 334 335 asm("pmaxsh %0, %1, %2\n\t" 336 : "=f" (ret) 337 : "f" (__m1), "f" (__m2) 338 ); 339 340 return ret; 341 } 342 343 extern __inline __m64 FUNCTION_ATTRIBS 344 _mm_max_pu8(__m64 __m1, __m64 __m2) 345 { 346 __m64 ret; 347 348 asm("pmaxub %0, %1, %2\n\t" 349 : "=f" (ret) 350 : "f" (__m1), "f" (__m2) 351 ); 352 353 return ret; 354 } 355 356 extern __inline __m64 FUNCTION_ATTRIBS 357 _mm_min_pi16(__m64 __m1, __m64 __m2) 358 { 359 __m64 ret; 360 361 asm("pminsh %0, %1, %2\n\t" 362 : "=f" (ret) 363 : "f" (__m1), "f" (__m2) 364 ); 365 366 return ret; 367 } 368 369 extern __inline __m64 FUNCTION_ATTRIBS 370 _mm_min_pu8(__m64 __m1, __m64 __m2) 371 { 372 __m64 ret; 373 374 asm("pminub %0, %1, %2\n\t" 375 : "=f" (ret) 376 : "f" (__m1), "f" (__m2) 377 ); 378 379 return ret; 380 } 381 382 extern __inline int FUNCTION_ATTRIBS 383 _mm_movemask_pi8(__m64 __m1) 384 { 385 int ret; 386 387 asm("pmovmskb %0, %1\n\t" 388 : "=r" (ret) 389 : "y" (__m1) 390 ); 391 392 return ret; 393 } 394 395 extern __inline __m64 FUNCTION_ATTRIBS 396 _mm_mulhi_pi16(__m64 __m1, __m64 __m2) 397 { 398 __m64 ret; 399 400 asm("pmulhh %0, %1, %2\n\t" 401 : "=f" (ret) 402 : "f" (__m1), "f" (__m2) 403 ); 404 405 return ret; 406 } 407 408 extern __inline __m64 FUNCTION_ATTRIBS 409 _mm_mulhi_pu16(__m64 __m1, __m64 __m2) 410 { 411 __m64 ret; 412 413 asm("pmulhuh %0, %1, %2\n\t" 414 : "=f" (ret) 415 : "f" (__m1), "f" (__m2) 416 ); 417 418 return ret; 419 } 420 421 extern __inline __m64 FUNCTION_ATTRIBS 422 _mm_mullo_pi16(__m64 __m1, __m64 __m2) 423 { 424 __m64 ret; 425 426 asm("pmullh %0, %1, %2\n\t" 427 : "=f" (ret) 428 : "f" (__m1), "f" (__m2) 429 ); 430 431 return ret; 432 } 433 434 extern __inline __m64 FUNCTION_ATTRIBS 435 _mm_mul_pu32(__m64 __m1, __m64 __m2) 436 { 437 __m64 ret; 438 439 asm("pmuluw %0, %1, %2\n\t" 440 : "=f" (ret) 441 : "f" (__m1), "f" (__m2) 442 ); 443 444 return ret; 445 } 446 447 extern __inline __m64 FUNCTION_ATTRIBS 448 _mm_sad_pu8(__m64 __m1, __m64 __m2) 449 { 450 __m64 ret; 451 452 asm("psadbh %0, %1, %2\n\t" 453 : "=f" (ret) 454 : "f" (__m1), "f" (__m2) 455 ); 456 457 return ret; 458 } 459 460 461 extern __inline __m64 FUNCTION_ATTRIBS 462 _mm_asub_pu8(__m64 __m1, __m64 __m2) 463 { 464 __m64 ret; 465 466 asm("pasubub %0, %1, %2\n\t" 467 : "=f" (ret) 468 : "f" (__m1), "f" (__m2) 469 ); 470 471 return ret; 472 } 473 474 extern __inline __m64 FUNCTION_ATTRIBS 475 _mm_biadd_pu8(__m64 __m1, __m64 __m2) 476 { 477 __m64 ret; 478 479 asm("biadd %0, %1, %2\n\t" 480 : "=f" (ret) 481 : "f" (__m1), "f" (__m2) 482 ); 483 484 return ret; 485 } 486 487 extern __inline __m64 FUNCTION_ATTRIBS 488 _mm_sub_pi8(__m64 __m1, __m64 __m2) 489 { 490 __m64 ret; 491 492 asm("psubb %0, %1, %2\n\t" 493 : "=f" (ret) 494 : "f" (__m1), "f" (__m2) 495 ); 496 497 return ret; 498 } 499 500 extern __inline __m64 FUNCTION_ATTRIBS 501 _mm_sub_pi16(__m64 __m1, __m64 __m2) 502 { 503 __m64 ret; 504 505 asm("psubh %0, %1, %2\n\t" 506 : "=f" (ret) 507 : "f" (__m1), "f" (__m2) 508 ); 509 510 return ret; 511 } 512 513 extern __inline __m64 FUNCTION_ATTRIBS 514 _mm_sub_pi32(__m64 __m1, __m64 __m2) 515 { 516 __m64 ret; 517 518 asm("psubw %0, %1, %2\n\t" 519 : "=f" (ret) 520 : "f" (__m1), "f" (__m2) 521 ); 522 523 return ret; 524 } 525 526 extern __inline __m64 FUNCTION_ATTRIBS 527 _mm_sub_si64(__m64 __m1, __m64 __m2) 528 { 529 __m64 ret; 530 531 asm("psubd %0, %1, %2\n\t" 532 : "=f" (ret) 533 : "f" (__m1), "f" (__m2) 534 ); 535 536 return ret; 537 } 538 539 extern __inline __m64 FUNCTION_ATTRIBS 540 _mm_subs_pi8(__m64 __m1, __m64 __m2) 541 { 542 __m64 ret; 543 544 asm("psubsb %0, %1, %2\n\t" 545 : "=f" (ret) 546 : "f" (__m1), "f" (__m2) 547 ); 548 549 return ret; 550 } 551 552 extern __inline __m64 FUNCTION_ATTRIBS 553 _mm_subs_pi16(__m64 __m1, __m64 __m2) 554 { 555 __m64 ret; 556 557 asm("psubsh %0, %1, %2\n\t" 558 : "=f" (ret) 559 : "f" (__m1), "f" (__m2) 560 ); 561 562 return ret; 563 } 564 565 566 extern __inline __m64 FUNCTION_ATTRIBS 567 _mm_subs_pu8(__m64 __m1, __m64 __m2) 568 { 569 __m64 ret; 570 571 asm("psubusb %0, %1, %2\n\t" 572 : "=f" (ret) 573 : "f" (__m1), "f" (__m2) 574 ); 575 576 return ret; 577 } 578 579 extern __inline __m64 FUNCTION_ATTRIBS 580 _mm_subs_pu16(__m64 __m1, __m64 __m2) 581 { 582 __m64 ret; 583 584 asm("psubush %0, %1, %2\n\t" 585 : "=f" (ret) 586 : "f" (__m1), "f" (__m2) 587 ); 588 589 return ret; 590 } 591 592 593 /********** Logical Operations **********/ 594 595 extern __inline __m64 FUNCTION_ATTRIBS 596 _mm_and_si64(__m64 __m1, __m64 __m2) 597 { 598 __m64 ret; 599 600 asm("and %0, %1, %2\n\t" 601 : "=f" (ret) 602 : "f" (__m1), "f" (__m2) 603 ); 604 605 return ret; 606 } 607 608 extern __inline __m64 FUNCTION_ATTRIBS 609 _mm_andnot_si64(__m64 __m1, __m64 __m2) 610 { 611 __m64 ret; 612 613 asm("andn %0, %1, %2\n\t" 614 : "=f" (ret) 615 : "f" (__m1), "f" (__m2) 616 ); 617 618 return ret; 619 } 620 621 622 extern __inline __m64 FUNCTION_ATTRIBS 623 _mm_or_si32(__m32 __m1, __m32 __m2) 624 { 625 __m32 ret; 626 627 asm("or %0, %1, %2\n\t" 628 : "=f" (ret) 629 : "f" (__m1), "f" (__m2) 630 ); 631 632 return ret; 633 } 634 635 extern __inline __m64 FUNCTION_ATTRIBS 636 _mm_or_si64(__m64 __m1, __m64 __m2) 637 { 638 __m64 ret; 639 640 asm("or %0, %1, %2\n\t" 641 : "=f" (ret) 642 : "f" (__m1), "f" (__m2) 643 ); 644 645 return ret; 646 } 647 648 extern __inline __m64 FUNCTION_ATTRIBS 649 _mm_xor_si64(__m64 __m1, __m64 __m2) 650 { 651 __m64 ret; 652 653 asm("xor %0, %1, %2\n\t" 654 : "=f" (ret) 655 : "f" (__m1), "f" (__m2) 656 ); 657 658 return ret; 659 } 660 661 662 /********** Shift Operations **********/ 663 664 extern __inline __m64 FUNCTION_ATTRIBS 665 _mm_slli_pi16(__m64 __m, int64_t __count) 666 { 667 __m64 ret; 668 669 asm("psllh %0, %1, %2\n\t" 670 : "=f" (ret) 671 : "f" (__m), "f" (*(__m64 *)&__count) 672 ); 673 674 return ret; 675 } 676 677 extern __inline __m64 FUNCTION_ATTRIBS 678 _mm_slli_pi32(__m64 __m, int64_t __count) 679 { 680 __m64 ret; 681 682 asm("psllw %0, %1, %2\n\t" 683 : "=f" (ret) 684 : "f" (__m), "f" (*(__m64 *)&__count) 685 ); 686 687 return ret; 688 } 689 690 extern __inline __m64 FUNCTION_ATTRIBS 691 _mm_slli_si64(__m64 __m, int64_t __count) 692 { 693 __m64 ret; 694 695 asm("dsll %0, %1, %2\n\t" 696 : "=f" (ret) 697 : "f" (__m), "f" (*(__m64 *)&__count) 698 ); 699 700 return ret; 701 } 702 703 extern __inline __m64 FUNCTION_ATTRIBS 704 _mm_srli_pi16(__m64 __m, int64_t __count) 705 { 706 __m64 ret; 707 708 asm("psrlh %0, %1, %2\n\t" 709 : "=f" (ret) 710 : "f" (__m), "f" (*(__m64 *)&__count) 711 ); 712 713 return ret; 714 } 715 716 extern __inline __m64 FUNCTION_ATTRIBS 717 _mm_srli_pi32(__m64 __m, int64_t __count) 718 { 719 __m64 ret; 720 721 asm("psrlw %0, %1, %2\n\t" 722 : "=f" (ret) 723 : "f" (__m), "f" (*(__m64 *)&__count) 724 ); 725 726 return ret; 727 } 728 729 extern __inline __m64 FUNCTION_ATTRIBS 730 _mm_srli_si64(__m64 __m, int64_t __count) 731 { 732 __m64 ret; 733 734 asm("dsrl %0, %1, %2\n\t" 735 : "=f" (ret) 736 : "f" (__m), "f" (*(__m64 *)&__count) 737 ); 738 739 return ret; 740 } 741 742 extern __inline __m64 FUNCTION_ATTRIBS 743 _mm_srai_pi16(__m64 __m, int64_t __count) 744 { 745 __m64 ret; 746 747 asm("psrah %0, %1, %2\n\t" 748 : "=f" (ret) 749 : "f" (__m), "f" (*(__m64 *)&__count) 750 ); 751 752 return ret; 753 } 754 755 extern __inline __m64 FUNCTION_ATTRIBS 756 _mm_srai_pi32(__m64 __m, int64_t __count) 757 { 758 __m64 ret; 759 760 asm("psraw %0, %1, %2\n\t" 761 : "=f" (ret) 762 : "f" (__m), "f" (*(__m64 *)&__count) 763 ); 764 765 return ret; 766 } 767 768 extern __inline __m64 FUNCTION_ATTRIBS 769 _mm_srai_si64(__m64 __m, int64_t __count) 770 { 771 __m64 ret; 772 773 asm("dsra %0, %1, %2\n\t" 774 : "=f" (ret) 775 : "f" (__m), "f" (*(__m64 *)&__count) 776 ); 777 778 return ret; 779 } 780 781 782 /********** Conversion Intrinsics **********/ 783 784 extern __inline __m64 FUNCTION_ATTRIBS 785 to_m64(uint64_t x) 786 { 787 return *(__m64 *)&x; 788 } 789 790 extern __inline uint64_t FUNCTION_ATTRIBS 791 to_uint64(__m64 x) 792 { 793 return *(uint64_t *)&x; 794 } 795 796 797 /********** Comparison Intrinsics **********/ 798 799 extern __inline __m64 FUNCTION_ATTRIBS 800 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) 801 { 802 __m64 ret; 803 804 asm("pcmpeqb %0, %1, %2\n\t" 805 : "=f" (ret) 806 : "f" (__m1), "f" (__m2) 807 ); 808 809 return ret; 810 } 811 812 extern __inline __m64 FUNCTION_ATTRIBS 813 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) 814 { 815 __m64 ret; 816 817 asm("pcmpeqh %0, %1, %2\n\t" 818 : "=f" (ret) 819 : "f" (__m1), "f" (__m2) 820 ); 821 822 return ret; 823 } 824 825 extern __inline __m64 FUNCTION_ATTRIBS 826 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) 827 { 828 __m64 ret; 829 830 asm("pcmpeqw %0, %1, %2\n\t" 831 : "=f" (ret) 832 : "f" (__m1), "f" (__m2) 833 ); 834 835 return ret; 836 } 837 838 extern __inline __m64 FUNCTION_ATTRIBS 839 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) 840 { 841 __m64 ret; 842 843 asm("pcmpgtb %0, %1, %2\n\t" 844 : "=f" (ret) 845 : "f" (__m1), "f" (__m2) 846 ); 847 848 return ret; 849 } 850 851 extern __inline __m64 FUNCTION_ATTRIBS 852 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) 853 { 854 __m64 ret; 855 856 asm("pcmpgth %0, %1, %2\n\t" 857 : "=f" (ret) 858 : "f" (__m1), "f" (__m2) 859 ); 860 861 return ret; 862 } 863 864 extern __inline __m64 FUNCTION_ATTRIBS 865 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) 866 { 867 __m64 ret; 868 869 asm("pcmpgtw %0, %1, %2\n\t" 870 : "=f" (ret) 871 : "f" (__m1), "f" (__m2) 872 ); 873 874 return ret; 875 } 876 877 extern __inline __m64 FUNCTION_ATTRIBS 878 _mm_cmplt_pi8(__m64 __m1, __m64 __m2) 879 { 880 __m64 ret; 881 882 asm("pcmpltb %0, %1, %2\n\t" 883 : "=f" (ret) 884 : "f" (__m1), "f" (__m2) 885 ); 886 887 return ret; 888 } 889 890 extern __inline __m64 FUNCTION_ATTRIBS 891 _mm_cmplt_pi16(__m64 __m1, __m64 __m2) 892 { 893 __m64 ret; 894 895 asm("pcmplth %0, %1, %2\n\t" 896 : "=f" (ret) 897 : "f" (__m1), "f" (__m2) 898 ); 899 900 return ret; 901 } 902 903 extern __inline __m64 FUNCTION_ATTRIBS 904 _mm_cmplt_pi32(__m64 __m1, __m64 __m2) 905 { 906 __m64 ret; 907 908 asm("pcmpltw %0, %1, %2\n\t" 909 : "=f" (ret) 910 : "f" (__m1), "f" (__m2) 911 ); 912 913 return ret; 914 } 915 916 917 /********** Miscellaneous Operations **********/ 918 919 extern __inline __m64 FUNCTION_ATTRIBS 920 _mm_packs_pi16(__m64 __m1, __m64 __m2) 921 { 922 __m64 ret; 923 924 asm("packsshb %0, %1, %2\n\t" 925 : "=f" (ret) 926 : "f" (__m1), "f" (__m2) 927 ); 928 929 return ret; 930 } 931 932 extern __inline __m64 FUNCTION_ATTRIBS 933 _mm_packs_pi32(__m64 __m1, __m64 __m2) 934 { 935 __m64 ret; 936 937 asm("packsswh %0, %1, %2\n\t" 938 : "=f" (ret) 939 : "f" (__m1), "f" (__m2) 940 ); 941 942 return ret; 943 } 944 945 extern __inline __m64 FUNCTION_ATTRIBS 946 _mm_packs_pi32_f(__m64 __m1, __m64 __m2) 947 { 948 __m64 ret; 949 950 asm("packsswh %0, %1, %2\n\t" 951 : "=f" (ret) 952 : "f" (__m1), "f" (__m2) 953 ); 954 955 return ret; 956 } 957 958 extern __inline __m64 FUNCTION_ATTRIBS 959 _mm_packs_pu16(__m64 __m1, __m64 __m2) 960 { 961 __m64 ret; 962 963 asm("packushb %0, %1, %2\n\t" 964 : "=f" (ret) 965 : "f" (__m1), "f" (__m2) 966 ); 967 968 return ret; 969 } 970 971 extern __inline __m64 FUNCTION_ATTRIBS 972 _mm_extract_pi16(__m64 __m, int64_t __pos) 973 { 974 __m64 ret; 975 976 asm("pextrh %0, %1, %2\n\t" 977 : "=f" (ret) 978 : "f" (__m), "f" (*(__m64 *)&__pos) 979 ); 980 981 return ret; 982 } 983 984 extern __inline __m64 FUNCTION_ATTRIBS 985 _mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos) 986 { 987 __m64 ret; 988 989 switch (__pos) { 990 case 0: 991 992 asm("pinsrh_0 %0, %1, %2\n\t" 993 : "=f" (ret) 994 : "f" (__m1), "f" (__m2), "i" (__pos) 995 ); 996 997 break; 998 999 case 1: 1000 1001 asm("pinsrh_1 %0, %1, %2\n\t" 1002 : "=f" (ret) 1003 : "f" (__m1), "f" (__m2), "i" (__pos) 1004 ); 1005 1006 break; 1007 case 2: 1008 1009 asm("pinsrh_2 %0, %1, %2\n\t" 1010 : "=f" (ret) 1011 : "f" (__m1), "f" (__m2), "i" (__pos) 1012 ); 1013 1014 break; 1015 1016 case 3: 1017 1018 asm("pinsrh_3 %0, %1, %2\n\t" 1019 : "=f" (ret) 1020 : "f" (__m1), "f" (__m2), "i" (__pos) 1021 ); 1022 1023 break; 1024 } 1025 1026 return ret; 1027 } 1028 1029 extern __inline __m64 FUNCTION_ATTRIBS 1030 _mm_shuffle_pi16(__m64 __m, int64_t __n) 1031 { 1032 __m64 ret; 1033 1034 asm("pshufh %0, %1, %2\n\t" 1035 : "=f" (ret) 1036 : "f" (__m), "f" (*(__m64 *)&__n) 1037 ); 1038 1039 return ret; 1040 } 1041 1042 extern __inline __m64 FUNCTION_ATTRIBS 1043 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) 1044 { 1045 __m64 ret; 1046 1047 asm("punpckhbh %0, %1, %2\n\t" 1048 : "=f" (ret) 1049 : "f" (__m1), "f" (__m2) 1050 ); 1051 1052 return ret; 1053 } 1054 1055 extern __inline __m64 FUNCTION_ATTRIBS 1056 _mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2) 1057 { 1058 __m64 ret; 1059 1060 asm("punpckhbh %0, %1, %2\n\t" 1061 : "=f" (ret) 1062 : "f" (__m1), "f" (__m2) 1063 ); 1064 1065 return ret; 1066 } 1067 1068 extern __inline __m64 FUNCTION_ATTRIBS 1069 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) 1070 { 1071 __m64 ret; 1072 1073 asm("punpckhhw %0, %1, %2\n\t" 1074 : "=f" (ret) 1075 : "f" (__m1), "f" (__m2) 1076 ); 1077 1078 return ret; 1079 } 1080 1081 extern __inline __m64 FUNCTION_ATTRIBS 1082 _mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2) 1083 { 1084 __m64 ret; 1085 1086 asm("punpckhhw %0, %1, %2\n\t" 1087 : "=f" (ret) 1088 : "f" (__m1), "f" (__m2) 1089 ); 1090 1091 return ret; 1092 } 1093 1094 extern __inline __m64 FUNCTION_ATTRIBS 1095 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) 1096 { 1097 __m64 ret; 1098 1099 asm("punpckhwd %0, %1, %2\n\t" 1100 : "=f" (ret) 1101 : "f" (__m1), "f" (__m2) 1102 ); 1103 1104 return ret; 1105 } 1106 1107 extern __inline __m64 FUNCTION_ATTRIBS 1108 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) 1109 { 1110 __m64 ret; 1111 1112 asm("punpcklbh %0, %1, %2\n\t" 1113 : "=f" (ret) 1114 : "f" (__m1), "f" (__m2) 1115 ); 1116 1117 return ret; 1118 } 1119 1120 /* Since punpcklbh cares about the high 32-bits, we use the __m64 datatype, 1121 which preserves the data. */ 1122 1123 extern __inline __m64 FUNCTION_ATTRIBS 1124 _mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2) 1125 { 1126 __m64 ret; 1127 1128 asm("punpcklbh %0, %1, %2\n\t" 1129 : "=f" (ret) 1130 : "f" (__m1), "f" (__m2) 1131 ); 1132 1133 return ret; 1134 } 1135 1136 /* Since punpcklbh doesn't care about the high 32-bits, we use the __m32, 1137 datatype, which allows load8888 to use 32-bit loads. */ 1138 1139 extern __inline __m64 FUNCTION_ATTRIBS 1140 _mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2) 1141 { 1142 __m64 ret; 1143 1144 asm("punpcklbh %0, %1, %2\n\t" 1145 : "=f" (ret) 1146 : "f" (__m1), "f" (__m2) 1147 ); 1148 1149 return ret; 1150 } 1151 1152 extern __inline __m64 FUNCTION_ATTRIBS 1153 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) 1154 { 1155 __m64 ret; 1156 1157 asm("punpcklhw %0, %1, %2\n\t" 1158 : "=f" (ret) 1159 : "f" (__m1), "f" (__m2) 1160 ); 1161 1162 return ret; 1163 } 1164 1165 extern __inline __m64 FUNCTION_ATTRIBS 1166 _mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2) 1167 { 1168 __m64 ret; 1169 1170 asm("punpcklhw %0, %1, %2\n\t" 1171 : "=f" (ret) 1172 : "f" (__m1), "f" (__m2) 1173 ); 1174 1175 return ret; 1176 } 1177 1178 extern __inline __m64 FUNCTION_ATTRIBS 1179 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) 1180 { 1181 __m64 ret; 1182 1183 asm("punpcklwd %0, %1, %2\n\t" 1184 : "=f" (ret) 1185 : "f" (__m1), "f" (__m2) 1186 ); 1187 1188 return ret; 1189 } 1190 1191 1192 extern __inline __m64 FUNCTION_ATTRIBS 1193 _mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2) 1194 { 1195 __m64 ret; 1196 1197 asm("punpcklwd %0, %1, %2\n\t" 1198 : "=f" (ret) 1199 : "f" (__m1), "f" (__m2) 1200 ); 1201 1202 return ret; 1203 } 1204 1205 extern __inline void FUNCTION_ATTRIBS 1206 _mm_store_pi32(__m32 *dest, __m64 src) 1207 { 1208 src = _mm_packs_pu16(src, _mm_setzero_si64()); 1209 1210 asm("swc1 %1, %0\n\t" 1211 : "=m" (*dest) 1212 : "f" (src) 1213 : "memory" 1214 ); 1215 } 1216 1217 extern __inline void FUNCTION_ATTRIBS 1218 _mm_store_si64(__m64 *dest, __m64 src) 1219 { 1220 asm("gssdlc1 %1, 7+%0\n\t" 1221 "gssdrc1 %1, %0\n\t" 1222 : "=m" (*dest) 1223 : "f" (src) 1224 : "memory" 1225 ); 1226 } 1227 1228 extern __inline __m64 FUNCTION_ATTRIBS 1229 _mm_load_si32(const __m32 *src) 1230 { 1231 __m32 ret; 1232 1233 asm("lwc1 %0, %1\n\t" 1234 : "=f" (ret) 1235 : "m" (*src) 1236 ); 1237 1238 return ret; 1239 } 1240 1241 extern __inline __m64 FUNCTION_ATTRIBS 1242 _mm_load_si64(const __m64 *src) 1243 { 1244 __m64 ret; 1245 1246 asm("ldc1 %0, %1\n\t" 1247 : "=f" (ret) 1248 : "m" (*src) 1249 : "memory" 1250 ); 1251 1252 return ret; 1253 } 1254 1255 extern __inline __m64 FUNCTION_ATTRIBS 1256 _mm_loadu_si64(const __m64 *src) 1257 { 1258 __m64 ret; 1259 1260 asm("gsldlc1 %0, 7(%1)\n\t" 1261 "gsldrc1 %0, 0(%1)\n\t" 1262 : "=f" (ret) 1263 : "r" (src) 1264 : "memory" 1265 ); 1266 1267 return ret; 1268 } 1269 1270 extern __inline __m64 FUNCTION_ATTRIBS 1271 _mm_loadlo_pi8(const uint32_t *src) 1272 { 1273 return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64()); 1274 } 1275 1276 extern __inline __m64 FUNCTION_ATTRIBS 1277 _mm_loadlo_pi8_f(__m64 src) 1278 { 1279 return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64()); 1280 } 1281 1282 extern __inline __m64 FUNCTION_ATTRIBS 1283 _mm_loadhi_pi8_f(__m64 src) 1284 { 1285 return _mm_unpackhi_pi8_f(src, _mm_setzero_si64()); 1286 } 1287 1288 extern __inline __m64 FUNCTION_ATTRIBS 1289 _mm_loadlo_pi16(__m64 src) 1290 { 1291 return _mm_unpacklo_pi16(src, _mm_setzero_si64()); 1292 } 1293 1294 extern __inline __m64 FUNCTION_ATTRIBS 1295 _mm_loadlo_pi16_f(__m64 src) 1296 { 1297 return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src); 1298 } 1299 1300 extern __inline __m64 FUNCTION_ATTRIBS 1301 _mm_loadhi_pi16(__m64 src) 1302 { 1303 return _mm_unpackhi_pi16(src, _mm_setzero_si64()); 1304 } 1305 1306 extern __inline __m64 FUNCTION_ATTRIBS 1307 _mm_loadhi_pi16_f(__m64 src) 1308 { 1309 return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src); 1310 } 1311 1312 extern __inline __m64 FUNCTION_ATTRIBS 1313 _mm_expand_alpha(__m64 pixel) 1314 { 1315 return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3)); 1316 } 1317 1318 extern __inline __m64 FUNCTION_ATTRIBS 1319 _mm_expand_alpha_rev(__m64 pixel) 1320 { 1321 return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0)); 1322 } 1323 1324 #endif /* __LOONGSON_MMINTRIN_H__ */ 1325