;
; x86 format converters for HERMES
; Some routines Copyright (c) 1998 Christian Nentwich (brn (a] eleet.mcb.at)
; This source code is licensed under the GNU LGPL
;
; Please refer to the file COPYING.LIB contained in the distribution for
; licensing conditions
;
; Most routines are (c) Glenn Fiedler (ptc (a] gaffer.org), used with permission
;

BITS 32

%include "common.inc"

SDL_FUNC _ConvertX86p32_32BGR888
SDL_FUNC _ConvertX86p32_32RGBA888
SDL_FUNC _ConvertX86p32_32BGRA888
SDL_FUNC _ConvertX86p32_24RGB888
SDL_FUNC _ConvertX86p32_24BGR888
SDL_FUNC _ConvertX86p32_16RGB565
SDL_FUNC _ConvertX86p32_16BGR565
SDL_FUNC _ConvertX86p32_16RGB555
SDL_FUNC _ConvertX86p32_16BGR555
SDL_FUNC _ConvertX86p32_8RGB332

SECTION .text

;; ---------------------------------------------------------------------------
;; _ConvertX86p32_* : convert a run of 32-bit source pixels.
;;
;; Register-based contract (these are not C-ABI entry points):
;;   In:       ESI = source, one dword per pixel
;;             EDI = dest
;;             ECX = number of pixels; 0 is accepted and returns at once
;;   Out:      ESI / EDI advanced past the pixels that were converted
;;   Destroys: EAX, EBX, ECX, EDX, flags
;;   Keeps:    EBP (saved and restored by the routines that use it)
;;
;; Byte diagrams such as [A][R][G][B] list a dword from its most significant
;; byte down to its least significant byte.
;; ---------------------------------------------------------------------------


;; 32 bit [A][R][G][B] -> 32 bit [A][B][G][R]  (bswap, then rotate right by 8)

_ConvertX86p32_32BGR888:

        test    ecx, ecx                ; nothing to convert?
        jz      .done

        cmp     ecx, BYTE 32            ; runs of up to 32 pixels stay in the
        ja      .long                   ; simple loop

.short_loop:
        mov     edx, [esi]
        bswap   edx
        ror     edx, 8
        mov     [edi], edx
        add     esi, BYTE 4
        add     edi, BYTE 4
        dec     ecx
        jnz     .short_loop
.done:
        retn

.long:
        push    ebp                     ; ebp becomes the block counter

        mov     ebp, ecx                ; ebp = number of 4-pixel blocks
        shr     ebp, 2

        push    ecx                     ; ecx is scratch below; keep the count

.unrolled:
        mov     eax, [esi]
        mov     ebx, [esi+4]

        bswap   eax

        bswap   ebx

        ror     eax, 8
        mov     ecx, [esi+8]

        ror     ebx, 8
        mov     edx, [esi+12]

        bswap   ecx

        bswap   edx

        ror     ecx, 8
        mov     [edi+0], eax

        ror     edx, 8
        mov     [edi+4], ebx

        mov     [edi+8], ecx
        mov     [edi+12], edx

        add     esi, BYTE 16
        add     edi, BYTE 16

        dec     ebp
        jnz     .unrolled

        pop     ecx                     ; ecx = leftover pixels (count mod 4)
        and     ecx, BYTE 11b
        jz      .restore

.tail_loop:
        mov     edx, [esi]
        bswap   edx
        ror     edx, 8
        mov     [edi], edx
        add     esi, BYTE 4
        add     edi, BYTE 4
        dec     ecx
        jnz     .tail_loop

.restore:
        pop     ebp
        retn




;; 32 bit [A][R][G][B] -> 32 bit [R][G][B][A]  (rotate left by 8)

_ConvertX86p32_32RGBA888:

        test    ecx, ecx                ; nothing to convert?
        jz      .done

        cmp     ecx, BYTE 32            ; runs of up to 32 pixels stay in the
        ja      .long                   ; simple loop

.short_loop:
        mov     edx, [esi]
        rol     edx, 8
        mov     [edi], edx
        add     esi, BYTE 4
        add     edi, BYTE 4
        dec     ecx
        jnz     .short_loop
.done:
        retn

.long:
        push    ebp                     ; ebp becomes the block counter

        mov     ebp, ecx                ; ebp = number of 4-pixel blocks
        shr     ebp, 2

        push    ecx                     ; ecx is scratch below; keep the count

.unrolled:
        mov     eax, [esi]
        mov     ebx, [esi+4]

        rol     eax, 8
        mov     ecx, [esi+8]

        rol     ebx, 8
        mov     edx, [esi+12]

        rol     ecx, 8
        mov     [edi+0], eax

        rol     edx, 8
        mov     [edi+4], ebx

        mov     [edi+8], ecx
        mov     [edi+12], edx

        add     esi, BYTE 16
        add     edi, BYTE 16

        dec     ebp
        jnz     .unrolled

        pop     ecx                     ; ecx = leftover pixels (count mod 4)
        and     ecx, BYTE 11b
        jz      .restore

.tail_loop:
        mov     edx, [esi]
        rol     edx, 8
        mov     [edi], edx
        add     esi, BYTE 4
        add     edi, BYTE 4
        dec     ecx
        jnz     .tail_loop

.restore:
        pop     ebp
        retn




;; 32 bit [A][R][G][B] -> 32 bit [B][G][R][A]  (plain byte swap)

_ConvertX86p32_32BGRA888:

        test    ecx, ecx                ; nothing to convert?
        jz      .done

        cmp     ecx, BYTE 32            ; runs of up to 32 pixels stay in the
        ja      .long                   ; simple loop

.short_loop:
        mov     edx, [esi]
        bswap   edx
        mov     [edi], edx
        add     esi, BYTE 4
        add     edi, BYTE 4
        dec     ecx
        jnz     .short_loop
.done:
        retn

.long:
        push    ebp                     ; ebp becomes the block counter

        mov     ebp, ecx                ; ebp = number of 4-pixel blocks
        shr     ebp, 2

        push    ecx                     ; ecx is scratch below; keep the count

.unrolled:
        mov     eax, [esi]
        mov     ebx, [esi+4]

        mov     ecx, [esi+8]
        mov     edx, [esi+12]

        bswap   eax

        bswap   ebx

        bswap   ecx

        bswap   edx

        mov     [edi+0], eax
        mov     [edi+4], ebx

        mov     [edi+8], ecx
        mov     [edi+12], edx

        add     esi, BYTE 16
        add     edi, BYTE 16

        dec     ebp
        jnz     .unrolled

        pop     ecx                     ; ecx = leftover pixels (count mod 4)
        and     ecx, BYTE 11b
        jz      .restore

.tail_loop:
        mov     edx, [esi]
        bswap   edx
        mov     [edi], edx
        add     esi, BYTE 4
        add     edi, BYTE 4
        dec     ecx
        jnz     .tail_loop

.restore:
        pop     ebp
        retn




;; 32 bit RGB 888 to 24 BIT RGB 888
;; Copies source bytes 0,1,2 of every pixel and drops byte 3.

_ConvertX86p32_24RGB888:

        test    ecx, ecx                ; nothing to convert?
        jz      .done

        cmp     ecx, BYTE 32            ; runs of up to 32 pixels stay in the
        ja      .align_head             ; simple loop

.short_loop:
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .short_loop
.done:
        retn

.align_head:
        ; copy single pixels until edi sits on a dword boundary
        mov     edx, edi
        and     edx, BYTE 11b
        jz      .aligned
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jmp     SHORT .align_head

.aligned:
        push    ebp                     ; ebp becomes the block counter
        mov     ebp, ecx                ; ebp = number of 4-pixel blocks
        shr     ebp, 2

        push    ecx                     ; ecx is scratch below; keep the count

        ; four source pixels (16 bytes) -> three destination dwords (12 bytes)
.unrolled:
        mov     eax, [esi]              ; first dword eax = [A][R][G][B]
        mov     ebx, [esi+4]            ; second dword ebx = [a][r][g][b]

        shl     eax, 8                  ; eax = [R][G][B][.]
        mov     ecx, [esi+12]           ; fourth source dword ecx = [a][r][g][b]

        shl     ebx, 8                  ; ebx = [r][g][b][.]
        mov     al, [esi+4]             ; eax = [R][G][B][b]

        ror     eax, 8                  ; eax = [b][R][G][B] (done)
        mov     bh, [esi+8+1]           ; ebx = [r][g][G][.]

        mov     [edi], eax
        add     edi, BYTE 3*4           ; edi moved early; stores below rebias

        shl     ecx, 8                  ; ecx = [r][g][b][.]
        mov     bl, [esi+8+0]           ; ebx = [r][g][G][B]

        rol     ebx, 16                 ; ebx = [G][B][r][g] (done)
        mov     cl, [esi+8+2]           ; ecx = [r][g][b][R] (done)

        mov     [edi+4-3*4], ebx
        add     esi, BYTE 4*4

        mov     [edi+8-3*4], ecx
        dec     ebp

        jnz     .unrolled

        pop     ecx                     ; ecx = leftover pixels (count mod 4)
        and     ecx, BYTE 11b
        jz      .restore

.tail_loop:
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .tail_loop

.restore:
        pop     ebp
        retn




;; 32 bit RGB 888 to 24 bit BGR 888
;; Writes source bytes 2,1,0 of every pixel (reversed order), drops byte 3.

_ConvertX86p32_24BGR888:

        test    ecx, ecx                ; nothing to convert?
        jz      .done

        cmp     ecx, BYTE 32            ; runs of up to 32 pixels stay in the
        ja      .align_head             ; simple loop

.short_loop:
        mov     dl, [esi]
        mov     bl, [esi+1]
        mov     al, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .short_loop
.done:
        retn

.align_head:
        ; copy single pixels until edi sits on a dword boundary
        mov     edx, edi
        and     edx, BYTE 11b
        jz      .aligned
        mov     dl, [esi]
        mov     bl, [esi+1]
        mov     al, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jmp     SHORT .align_head

.aligned:
        push    ebp                     ; ebp becomes the block counter
        mov     ebp, ecx                ; ebp = number of 4-pixel blocks
        shr     ebp, 2

        push    ecx                     ; ecx is scratch below; keep the count

        ; four source pixels (16 bytes) -> three destination dwords (12 bytes)
.unrolled:
        mov     eax, [esi]              ; first dword eax = [A][R][G][B]
        mov     ebx, [esi+4]            ; second dword ebx = [a][r][g][b]

        bswap   eax                     ; eax = [B][G][R][A]

        bswap   ebx                     ; ebx = [b][g][r][a]

        mov     al, [esi+4+2]           ; eax = [B][G][R][r]
        mov     bh, [esi+4+4+1]         ; ebx = [b][g][G][a]

        ror     eax, 8                  ; eax = [r][B][G][R] (done)
        mov     bl, [esi+4+4+2]         ; ebx = [b][g][G][R]

        ror     ebx, 16                 ; ebx = [G][R][b][g] (done)
        mov     [edi], eax

        mov     [edi+4], ebx
        mov     ecx, [esi+12]           ; fourth source dword ecx = [a][r][g][b]

        bswap   ecx                     ; ecx = [b][g][r][a]

        mov     cl, [esi+8]             ; ecx = [b][g][r][B] (done)
        add     esi, BYTE 4*4

        mov     [edi+8], ecx
        add     edi, BYTE 3*4

        dec     ebp
        jnz     .unrolled

        pop     ecx                     ; ecx = leftover pixels (count mod 4)
        and     ecx, BYTE 11b
        jz      .restore

.tail_loop:
        mov     dl, [esi]
        mov     bl, [esi+1]
        mov     al, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .tail_loop

.restore:
        pop     ebp
        retn




;; 32 bit RGB 888 to 16 BIT RGB 565
;; Result word: red in bits 11-15, green in bits 5-10, blue in bits 0-4.

_ConvertX86p32_16RGB565:

        test    ecx, ecx                ; nothing to convert?
        jz      .done

        cmp     ecx, BYTE 16            ; runs of up to 16 pixels stay in the
        ja      .long                   ; simple loop

.short_loop:
        mov     bl, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     ah, [esi+2]             ; red
        shr     ah, 3                   ; red: keep top 5 bits
        and     al, 11111100b           ; green: keep top 6 bits
        shl     eax, 3                  ; ax = red<<11 | green<<5
        shr     bl, 3                   ; blue: keep top 5 bits
        add     al, bl
        mov     [edi+0], al             ; only the low word of eax is stored
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .short_loop

.done:
        retn


.long:
        ; If edi is not on a dword boundary, convert one pixel first.
        ; NOTE(review): presumably edi is at least 2-byte aligned, so a single
        ; 16-bit store is enough to reach the boundary - confirm with callers.
        mov     ebx, edi
        and     ebx, BYTE 11b
        jz      .aligned

        mov     bl, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     ah, [esi+2]             ; red
        shr     ah, 3
        and     al, 11111100b
        shl     eax, 3
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx

.aligned:
        push    ecx                     ; keep the count for the odd-pixel test

        shr     ecx, 1                  ; ecx = number of pixel pairs

        ; point both arrays at the end of the paired region
        lea     esi, [esi+ecx*8]
        lea     edi, [edi+ecx*4]

        ; ecx counts up from -pairs to 0
        neg     ecx
        jmp     SHORT .pair_load

        ; The store of each finished pair is deferred to the top of the next
        ; iteration; the last pair is stored right after the loop.
.pair_store:
        mov     [edi+ecx*4-4], eax
.pair_load:
        mov     eax, [esi+ecx*8]        ; eax = first pixel [A][R][G][B]

        shr     ah, 2                   ; ah = G >> 2 (6 green bits)
        mov     ebx, [esi+ecx*8+4]      ; ebx = second pixel [a][r][g][b]

        shr     eax, 3                  ; bits 0-4 = B >> 3, bits 5-10 = green
        mov     edx, [esi+ecx*8+4]      ; edx = second pixel again (for red)

        shr     bh, 2                   ; bh = g >> 2 (6 green bits)
        mov     dl, [esi+ecx*8+2]       ; edx = [a][r][g][R]

        shl     ebx, 13                 ; second pixel blue/green -> bits 16-26
        and     eax, 000007FFh          ; first pixel green+blue only

        shl     edx, 8                  ; edx = [r][g][R][.]
        and     ebx, 07FF0000h          ; second pixel green+blue only

        and     edx, 0F800F800h         ; top 5 red bits of both pixels
        add     eax, ebx

        add     eax, edx                ; eax = two packed 565 pixels
        inc     ecx

        jnz     .pair_store

        mov     [edi+ecx*4-4], eax      ; ecx = 0 here: store the last pair

        ; odd pixel left over?
        pop     ecx
        test    cl, 1
        jz      .exit

        mov     bl, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     ah, [esi+2]             ; red
        shr     ah, 3
        and     al, 11111100b
        shl     eax, 3
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2

.exit:
        retn




;; 32 bit RGB 888 to 16 BIT BGR 565
;; Result word: blue in bits 11-15, green in bits 5-10, red in bits 0-4.

_ConvertX86p32_16BGR565:

        test    ecx, ecx                ; nothing to convert?
        jz      .done

        cmp     ecx, BYTE 16            ; runs of up to 16 pixels stay in the
        ja      .long                   ; simple loop

.short_loop:
        mov     ah, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     bl, [esi+2]             ; red
        shr     ah, 3                   ; blue: keep top 5 bits
        and     al, 11111100b           ; green: keep top 6 bits
        shl     eax, 3                  ; ax = blue<<11 | green<<5
        shr     bl, 3                   ; red: keep top 5 bits
        add     al, bl
        mov     [edi+0], al             ; only the low word of eax is stored
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .short_loop
.done:
        retn

.long:
        ; If edi is not on a dword boundary, convert one pixel first.
        ; NOTE(review): presumably edi is at least 2-byte aligned, so a single
        ; 16-bit store is enough to reach the boundary - confirm with callers.
        mov     ebx, edi
        and     ebx, BYTE 11b
        jz      .aligned
        mov     ah, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     bl, [esi+2]             ; red
        shr     ah, 3
        and     al, 11111100b
        shl     eax, 3
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx

.aligned:
        push    ecx                     ; keep the count for the odd-pixel test

        shr     ecx, 1                  ; ecx = number of pixel pairs

        ; point both arrays at the end of the paired region
        lea     esi, [esi+ecx*8]
        lea     edi, [edi+ecx*4]

        ; ecx counts up from -pairs to 0
        neg     ecx
        jmp     SHORT .pair_load

        ; The store of each finished pair is deferred to the top of the next
        ; iteration; the last pair is stored right after the loop.
.pair_store:
        mov     [edi+ecx*4-4], eax
.pair_load:
        mov     edx, [esi+ecx*8+4]      ; edx = second pixel [a][r][g][b]

        mov     bh, [esi+ecx*8+4]       ; bh = second pixel blue
        mov     ah, [esi+ecx*8]         ; ah = first pixel blue

        shr     bh, 3                   ; keep top 5 blue bits
        mov     al, [esi+ecx*8+1]       ; al = first pixel green

        shr     ah, 3                   ; keep top 5 blue bits
        mov     bl, [esi+ecx*8+5]       ; bl = second pixel green

        shl     eax, 3                  ; first pixel blue -> 11-15, green -> 3-10
        mov     dl, [esi+ecx*8+2]       ; edx = [a][r][g][R]

        shl     ebx, 19                 ; second pixel blue -> 27-31, green -> 19-26
        and     eax, 0000FFE0h          ; first pixel blue + top 6 green bits

        shr     edx, 3                  ; R >> 3 in bits 0-4, r >> 3 in bits 16-20
        and     ebx, 0FFE00000h         ; second pixel blue + top 6 green bits

        and     edx, 001F001Fh          ; 5 red bits of both pixels
        add     eax, ebx

        add     eax, edx                ; eax = two packed 565 pixels
        inc     ecx

        jnz     .pair_store

        mov     [edi+ecx*4-4], eax      ; ecx = 0 here: store the last pair

        ; odd pixel left over?
        pop     ecx
        and     ecx, BYTE 1
        jz      .exit
        mov     ah, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     bl, [esi+2]             ; red
        shr     ah, 3
        and     al, 11111100b
        shl     eax, 3
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2

.exit:
        retn




;; 32 BIT RGB TO 16 BIT RGB 555
;; Result word: red in bits 10-14, green in bits 5-9, blue in bits 0-4.

_ConvertX86p32_16RGB555:

        test    ecx, ecx                ; nothing to convert?
        jz      .done

        cmp     ecx, BYTE 16            ; runs of up to 16 pixels stay in the
        ja      .long                   ; simple loop

.short_loop:
        mov     bl, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     ah, [esi+2]             ; red
        shr     ah, 3                   ; red: keep top 5 bits
        and     al, 11111000b           ; green: keep top 5 bits
        shl     eax, 2                  ; ax = red<<10 | green<<5
        shr     bl, 3                   ; blue: keep top 5 bits
        add     al, bl
        mov     [edi+0], al             ; only the low word of eax is stored
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .short_loop
.done:
        retn

.long:
        ; If edi is not on a dword boundary, convert one pixel first.
        ; NOTE(review): presumably edi is at least 2-byte aligned, so a single
        ; 16-bit store is enough to reach the boundary - confirm with callers.
        mov     ebx, edi
        and     ebx, BYTE 11b
        jz      .aligned
        mov     bl, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     ah, [esi+2]             ; red
        shr     ah, 3
        and     al, 11111000b
        shl     eax, 2
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx

.aligned:
        push    ecx                     ; keep the count for the odd-pixel test

        shr     ecx, 1                  ; ecx = number of pixel pairs

        ; point both arrays at the end of the paired region
        lea     esi, [esi+ecx*8]
        lea     edi, [edi+ecx*4]

        ; ecx counts up from -pairs to 0
        neg     ecx
        jmp     SHORT .pair_load

        ; The store of each finished pair is deferred to the top of the next
        ; iteration; the last pair is stored right after the loop.
.pair_store:
        mov     [edi+ecx*4-4], eax
.pair_load:
        mov     eax, [esi+ecx*8]        ; eax = first pixel [A][R][G][B]

        shr     ah, 3                   ; ah = G >> 3 (5 green bits)
        mov     ebx, [esi+ecx*8+4]      ; ebx = second pixel [a][r][g][b]

        shr     eax, 3                  ; bits 0-4 = B >> 3, bits 5-9 = green
        mov     edx, [esi+ecx*8+4]      ; edx = second pixel again (for red)

        shr     bh, 3                   ; bh = g >> 3 (5 green bits)
        mov     dl, [esi+ecx*8+2]       ; edx = [a][r][g][R]

        shl     ebx, 13                 ; second pixel blue/green -> bits 16-25
        and     eax, 000007FFh          ; first pixel green+blue only

        shl     edx, 7                  ; R -> bits 7-14, r -> bits 23-30
        and     ebx, 07FF0000h          ; second pixel green+blue only

        and     edx, 07C007C00h         ; top 5 red bits of both pixels
        add     eax, ebx

        add     eax, edx                ; eax = two packed 555 pixels
        inc     ecx

        jnz     .pair_store

        mov     [edi+ecx*4-4], eax      ; ecx = 0 here: store the last pair

        ; odd pixel left over?
        pop     ecx
        and     ecx, BYTE 1
        jz      .exit
        mov     bl, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     ah, [esi+2]             ; red
        shr     ah, 3
        and     al, 11111000b
        shl     eax, 2
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2

.exit:
        retn




;; 32 BIT RGB TO 16 BIT BGR 555
;; Result word: blue in bits 10-14, green in bits 5-9, red in bits 0-4.

_ConvertX86p32_16BGR555:

        test    ecx, ecx                ; nothing to convert?
        jz      .done

        cmp     ecx, BYTE 16            ; runs of up to 16 pixels stay in the
        ja      .long                   ; simple loop


.short_loop:
        mov     ah, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     bl, [esi+2]             ; red
        shr     ah, 3                   ; blue: keep top 5 bits
        and     al, 11111000b           ; green: keep top 5 bits
        shl     eax, 2                  ; ax = blue<<10 | green<<5
        shr     bl, 3                   ; red: keep top 5 bits
        add     al, bl
        mov     [edi+0], al             ; only the low word of eax is stored
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .short_loop
.done:
        retn

.long:
        ; If edi is not on a dword boundary, convert one pixel first.
        ; NOTE(review): presumably edi is at least 2-byte aligned, so a single
        ; 16-bit store is enough to reach the boundary - confirm with callers.
        mov     ebx, edi
        and     ebx, BYTE 11b
        jz      .aligned
        mov     ah, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     bl, [esi+2]             ; red
        shr     ah, 3
        and     al, 11111000b
        shl     eax, 2
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx

.aligned:
        push    ecx                     ; keep the count for the odd-pixel test

        shr     ecx, 1                  ; ecx = number of pixel pairs

        ; point both arrays at the end of the paired region
        lea     esi, [esi+ecx*8]
        lea     edi, [edi+ecx*4]

        ; ecx counts up from -pairs to 0
        neg     ecx
        jmp     SHORT .pair_load

        ; The store of each finished pair is deferred to the top of the next
        ; iteration; the last pair is stored right after the loop.
.pair_store:
        mov     [edi+ecx*4-4], eax
.pair_load:
        mov     edx, [esi+ecx*8+4]      ; edx = second pixel [a][r][g][b]

        mov     bh, [esi+ecx*8+4]       ; bh = second pixel blue
        mov     ah, [esi+ecx*8]         ; ah = first pixel blue

        shr     bh, 3                   ; keep top 5 blue bits
        mov     al, [esi+ecx*8+1]       ; al = first pixel green

        shr     ah, 3                   ; keep top 5 blue bits
        mov     bl, [esi+ecx*8+5]       ; bl = second pixel green

        shl     eax, 2                  ; first pixel blue -> 10-14, green -> 2-9
        mov     dl, [esi+ecx*8+2]       ; edx = [a][r][g][R]

        shl     ebx, 18                 ; second pixel blue -> 26-30, green -> 18-25
        and     eax, 00007FE0h          ; first pixel blue + top 5 green bits

        shr     edx, 3                  ; R >> 3 in bits 0-4, r >> 3 in bits 16-20
        and     ebx, 07FE00000h         ; second pixel blue + top 5 green bits

        and     edx, 001F001Fh          ; 5 red bits of both pixels
        add     eax, ebx

        add     eax, edx                ; eax = two packed 555 pixels
        inc     ecx

        jnz     .pair_store

        mov     [edi+ecx*4-4], eax      ; ecx = 0 here: store the last pair

        ; odd pixel left over?
        pop     ecx
        and     ecx, BYTE 1
        jz      .exit
        mov     ah, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     bl, [esi+2]             ; red
        shr     ah, 3
        and     al, 11111000b
        shl     eax, 2
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2

.exit:
        retn





;; FROM 32 BIT RGB to 8 BIT RGB (rrrgggbb)
;; Each result byte holds the top 3 bits of source byte 2 in bits 5-7, the
;; top 3 bits of source byte 1 in bits 2-4 and the top 2 bits of source byte 0
;; in bits 0-1.
;; This routine writes FOUR pixels at once (dword) and then, if they exist,
;; the trailing one to three pixels one byte at a time.
_ConvertX86p32_8RGB332:

        push    ecx                     ; keep the count for the tail

        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      NEAR .tail              ; body is too long for a short jump

.quad_loop:
        mov     eax, [esi]              ; first pair of pixels
        mov     edx, [esi+4]

        shr     dl, 6                   ; pixel 1 blue: top 2 bits
        mov     ebx, eax

        shr     al, 6                   ; pixel 0 blue: top 2 bits
        and     ah, 0e0h                ; pixel 0 green: top 3 bits

        shr     ebx, 16                 ; bl = pixel 0 red
        and     dh, 0e0h                ; pixel 1 green: top 3 bits

        shr     ah, 3                   ; pixel 0 green -> bits 2-4
        and     bl, 0e0h                ; pixel 0 red: top 3 bits

        shr     dh, 3                   ; pixel 1 green -> bits 2-4

        or      al, bl

        mov     ebx, edx
        or      al, ah                  ; al = pixel 0 packed

        shr     ebx, 16                 ; bl = pixel 1 red
        or      dl, dh

        and     bl, 0e0h

        or      dl, bl                  ; dl = pixel 1 packed

        mov     ah, dl                  ; ax = pixel 1 : pixel 0



        mov     ebx, [esi+8]            ; second pair of pixels

        mov     edx, ebx
        and     bh, 0e0h                ; pixel 2 green: top 3 bits

        shr     bl, 6                   ; pixel 2 blue: top 2 bits
        and     edx, 0e00000h           ; pixel 2 red: top 3 bits

        shr     edx, 16                 ; dl = pixel 2 red bits

        shr     bh, 3                   ; pixel 2 green -> bits 2-4

        ror     eax, 16                 ; park pixels 0/1 in the high word
        or      bl, dl

        mov     edx, [esi+12]
        or      bl, bh                  ; bl = pixel 2 packed

        mov     al, bl

        mov     ebx, edx
        and     dh, 0e0h                ; pixel 3 green: top 3 bits

        shr     dl, 6                   ; pixel 3 blue: top 2 bits
        and     ebx, 0e00000h           ; pixel 3 red: top 3 bits

        shr     dh, 3                   ; pixel 3 green -> bits 2-4
        mov     ah, dl

        shr     ebx, 16                 ; bl = pixel 3 red bits
        or      ah, dh

        or      ah, bl                  ; ah = pixel 3 packed

        rol     eax, 16                 ; eax = pixel 3 : 2 : 1 : 0
        add     esi, BYTE 16

        mov     [edi], eax
        add     edi, BYTE 4

        dec     ecx
        jnz     NEAR .quad_loop         ; body is too long for a short jump

.tail:
        pop     ecx
        and     ecx, BYTE 3             ; mask out number of pixels to draw

        jz      .done                   ; Nothing to do anymore

.tail_loop:
        mov     eax, [esi]              ; single pixel conversion for trailing pixels

        mov     ebx, eax

        shr     al, 6
        and     ah, 0e0h

        shr     ebx, 16

        shr     ah, 3
        and     bl, 0e0h

        or      al, ah
        or      al, bl

        mov     [edi], al

        inc     edi
        add     esi, BYTE 4

        dec     ecx
        jnz     .tail_loop

.done:
        retn

%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif