Home | History | Annotate | Download | only in hermes
      1 ;
      2 ; pII-optimised MMX format converters for HERMES
      3 ; Copyright (c) 1998 Christian Nentwich (c.nentwich (a] cs.ucl.ac.uk)
      4 ;   and (c) 1999 Jonathan Matthew (jmatthew (a] uq.net.au)
      5 ; This source code is licensed under the GNU LGPL
      6 ; 
      7 ; Please refer to the file COPYING.LIB contained in the distribution for
      8 ; licensing conditions		
      9 ;
     10 ; COPYRIGHT NOTICE
     11 ; 
     12 ; This file partly contains code that is (c) Intel Corporation, specifically
     13 ; the mode detection routine, and the converter to 15 bit (8 pixel
     14 ; conversion routine from the mmx programming tutorial pages).
     15 ;
     16 ;
     17 ; These routines aren't exactly pII optimised - it's just that as they
     18 ; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
     19 ; optimise them for p5 MMXs..
     20 
     21 BITS 32
     22 
     23 %include "common.inc"
     24 	
     25 SDL_FUNC _ConvertMMXpII32_24RGB888
     26 SDL_FUNC _ConvertMMXpII32_16RGB565
     27 SDL_FUNC _ConvertMMXpII32_16BGR565
     28 SDL_FUNC _ConvertMMXpII32_16RGB555
     29 SDL_FUNC _ConvertMMXpII32_16BGR555
     30 
     31 ;; Macros for conversion routines
     32 
     33 %macro _push_immq_mask 1
     34 	push dword %1
     35 	push dword %1
     36 %endmacro
     37 
     38 %macro load_immq 2
     39 	_push_immq_mask %2
     40 	movq %1, [esp]
     41 %endmacro
     42 
     43 %macro pand_immq 2
     44 	_push_immq_mask %2
     45 	pand %1, [esp]
     46 %endmacro
     47 
     48 %define CLEANUP_IMMQ_LOADS(num) \
     49 	add esp, byte 8 * num
     50 
     51 %define mmx32_rgb888_mask 00ffffffh
     52 %define mmx32_rgb565_b 000000f8h
     53 %define mmx32_rgb565_g 0000fc00h
     54 %define mmx32_rgb565_r 00f80000h
     55 
     56 %define mmx32_rgb555_rb 00f800f8h
     57 %define mmx32_rgb555_g 0000f800h
     58 %define mmx32_rgb555_mul 20000008h
     59 %define mmx32_bgr555_mul 00082000h
     60 
     61 SECTION .text
     62 
     63 _ConvertMMXpII32_24RGB888:
     64 
     65         ; set up mm6 as the mask, mm7 as zero
     66         load_immq mm6, mmx32_rgb888_mask
     67         CLEANUP_IMMQ_LOADS(1)
     68         pxor mm7, mm7
     69 
     70         mov edx, ecx                    ; save ecx
     71         and ecx, 0fffffffch             ; clear lower two bits
     72         jnz .L1
     73         jmp .L2
     74 
     75 .L1:
     76 
     77         movq mm0, [esi]                 ; A R G B a r g b
     78         pand mm0, mm6                   ; 0 R G B 0 r g b
     79         movq mm1, [esi+8]               ; A R G B a r g b
     80         pand mm1, mm6                   ; 0 R G B 0 r g b
     81 
     82         movq mm2, mm0                   ; 0 R G B 0 r g b
     83         punpckhdq mm2, mm7              ; 0 0 0 0 0 R G B
     84         punpckldq mm0, mm7              ; 0 0 0 0 0 r g b
     85         psllq mm2, 24                   ; 0 0 R G B 0 0 0
     86         por mm0, mm2                    ; 0 0 R G B r g b
     87 
     88         movq mm3, mm1                   ; 0 R G B 0 r g b
     89         psllq mm3, 48                   ; g b 0 0 0 0 0 0
     90         por mm0, mm3                    ; g b R G B r g b
     91 
     92         movq mm4, mm1                   ; 0 R G B 0 r g b
     93         punpckhdq mm4, mm7              ; 0 0 0 0 0 R G B
     94         punpckldq mm1, mm7              ; 0 0 0 0 0 r g b
     95         psrlq mm1, 16                   ; 0 0 0 R G B 0 r
     96         psllq mm4, 8                    ; 0 0 0 0 R G B 0
     97         por mm1, mm4                    ; 0 0 0 0 R G B r
     98 
     99         movq [edi], mm0
    100         add esi, BYTE 16
    101         movd [edi+8], mm1
    102         add edi, BYTE 12
    103         sub ecx, BYTE 4
    104         jnz .L1
    105 
    106 .L2:
    107         mov ecx, edx
    108         and ecx, BYTE 3
    109         jz .L4
    110 .L3:
    111         mov al, [esi]
    112         mov bl, [esi+1]
    113         mov dl, [esi+2]
    114         mov [edi], al
    115         mov [edi+1], bl
    116         mov [edi+2], dl
    117         add esi, BYTE 4
    118         add edi, BYTE 3
    119         dec ecx
    120         jnz .L3
    121 .L4:
    122         retn
    123 
    124 
    125 
    126 _ConvertMMXpII32_16RGB565:
    127 
    128         ; set up masks
    129         load_immq mm5, mmx32_rgb565_b
    130         load_immq mm6, mmx32_rgb565_g
    131         load_immq mm7, mmx32_rgb565_r
    132         CLEANUP_IMMQ_LOADS(3)
    133 
    134         mov edx, ecx
    135         shr ecx, 2
    136         jnz .L1
    137         jmp .L2         ; not necessary at the moment, but doesn't hurt (much)
    138 
    139 .L1:
    140         movq mm0, [esi]         ; argb
    141         movq mm1, mm0           ; argb
    142         pand mm0, mm6           ; 00g0
    143         movq mm3, mm1           ; argb
    144         pand mm1, mm5           ; 000b
    145         pand mm3, mm7           ; 0r00
    146         pslld mm1, 2            ; 0 0 000000bb bbb00000
    147         por mm0, mm1            ; 0 0 ggggggbb bbb00000
    148         psrld mm0, 5            ; 0 0 00000ggg gggbbbbb
    149 
    150         movq mm4, [esi+8]       ; argb
    151         movq mm2, mm4           ; argb
    152         pand mm4, mm6           ; 00g0
    153         movq mm1, mm2           ; argb
    154         pand mm2, mm5           ; 000b
    155         pand mm1, mm7           ; 0r00
    156         pslld mm2, 2            ; 0 0 000000bb bbb00000
    157         por mm4, mm2            ; 0 0 ggggggbb bbb00000
    158         psrld mm4, 5            ; 0 0 00000ggg gggbbbbb
    159 
    160         packuswb mm3, mm1       ; R 0 r 0
    161         packssdw mm0, mm4       ; as above.. ish
    162         por mm0, mm3            ; done.
    163         movq [edi], mm0
    164 
    165         add esi, 16
    166         add edi, 8
    167         dec ecx
    168         jnz .L1
    169 
    170 .L2:
    171         mov ecx, edx
    172         and ecx, BYTE 3
    173         jz .L4
    174 .L3:
    175         mov al, [esi]
    176         mov bh, [esi+1]
    177         mov ah, [esi+2]
    178         shr al, 3
    179         and eax, 0F81Fh            ; BYTE?
    180         shr ebx, 5
    181         and ebx, 07E0h             ; BYTE?
    182         add eax, ebx
    183         mov [edi], al
    184         mov [edi+1], ah
    185         add esi, BYTE 4
    186         add edi, BYTE 2
    187         dec ecx
    188         jnz .L3
    189 
    190 .L4:
    191 	retn
    192 
    193 	
    194 _ConvertMMXpII32_16BGR565:
    195 
    196         load_immq mm5, mmx32_rgb565_r
    197         load_immq mm6, mmx32_rgb565_g
    198         load_immq mm7, mmx32_rgb565_b
    199         CLEANUP_IMMQ_LOADS(3)
    200 
    201         mov edx, ecx
    202         shr ecx, 2
    203         jnz .L1
    204         jmp .L2
    205 
    206 .L1:
    207         movq mm0, [esi]                 ; a r g b
    208         movq mm1, mm0                   ; a r g b
    209         pand mm0, mm6                   ; 0 0 g 0
    210         movq mm3, mm1                   ; a r g b
    211         pand mm1, mm5                   ; 0 r 0 0
    212         pand mm3, mm7                   ; 0 0 0 b
    213 
    214         psllq mm3, 16                   ; 0 b 0 0
    215         psrld mm1, 14                   ; 0 0 000000rr rrr00000
    216         por mm0, mm1                    ; 0 0 ggggggrr rrr00000
    217         psrld mm0, 5                    ; 0 0 00000ggg gggrrrrr
    218 
    219         movq mm4, [esi+8]               ; a r g b
    220         movq mm2, mm4                   ; a r g b
    221         pand mm4, mm6                   ; 0 0 g 0
    222         movq mm1, mm2                   ; a r g b
    223         pand mm2, mm5                   ; 0 r 0 0
    224         pand mm1, mm7                   ; 0 0 0 b
    225 
    226         psllq mm1, 16                   ; 0 b 0 0
    227         psrld mm2, 14                   ; 0 0 000000rr rrr00000
    228         por mm4, mm2                    ; 0 0 ggggggrr rrr00000
    229         psrld mm4, 5                    ; 0 0 00000ggg gggrrrrr
    230 
    231         packuswb mm3, mm1               ; BBBBB000 00000000 bbbbb000 00000000
    232         packssdw mm0, mm4               ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
    233         por mm0, mm3                    ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
    234         movq [edi], mm0
    235 
    236         add esi, BYTE 16
    237         add edi, BYTE 8
    238         dec ecx
    239         jnz .L1
    240 
    241 .L2:
    242         and edx, BYTE 3
    243         jz .L4
    244 .L3:
    245         mov al, [esi+2]
    246         mov bh, [esi+1]
    247         mov ah, [esi]
    248         shr al, 3
    249         and eax, 0F81Fh                    ; BYTE ?
    250         shr ebx, 5
    251         and ebx, 07E0h                     ; BYTE ?
    252         add eax, ebx
    253         mov [edi], al
    254         mov [edi+1], ah
    255         add esi, BYTE 4
    256         add edi, BYTE 2
    257         dec edx
    258         jnz .L3
    259 
    260 .L4:
    261         retn
    262 
    263 _ConvertMMXpII32_16BGR555:
    264 
    265         ; the 16BGR555 converter is identical to the RGB555 one,
    266         ; except it uses a different multiplier for the pmaddwd
    267         ; instruction.  cool huh.
    268 
    269         load_immq mm7, mmx32_bgr555_mul
    270         jmp _convert_bgr555_cheat
    271 
    272 ; This is the same as the Intel version.. they obviously went to
    273 ; much more trouble to expand/coil the loop than I did, so theirs
    274 ; would almost certainly be faster, even if only a little.
    275 ; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
    276 ; (I think) a more accurate name..
    277 _ConvertMMXpII32_16RGB555:
    278 
    279 	load_immq mm7, mmx32_rgb555_mul
    280 _convert_bgr555_cheat:
    281 	load_immq mm6, mmx32_rgb555_g
    282 	CLEANUP_IMMQ_LOADS(2)
    283         
    284 	mov edx,ecx		           ; Save ecx 
    285 
    286         and ecx,DWORD 0fffffff8h            ; clear lower three bits
    287 	jnz .L_OK
    288         jmp near .L2 
    289 
    290 .L_OK:
    291 	
    292 	movq mm2,[esi+8]
    293 
    294 	movq mm0,[esi]
    295 	movq mm3,mm2
    296 
    297 	pand_immq mm3, mmx32_rgb555_rb
    298 	movq mm1,mm0
    299 
    300 	pand_immq mm1, mmx32_rgb555_rb
    301 	pmaddwd mm3,mm7
    302 
    303 	CLEANUP_IMMQ_LOADS(2)
    304 
    305 	pmaddwd mm1,mm7
    306 	pand mm2,mm6
    307 
    308 .L1:
    309 	movq mm4,[esi+24]
    310 	pand mm0,mm6
    311 
    312 	movq mm5,[esi+16]
    313 	por mm3,mm2
    314 
    315 	psrld mm3,6
    316 	por mm1,mm0
    317 
    318 	movq mm0,mm4
    319 	psrld mm1,6
    320 
    321 	pand_immq mm0, mmx32_rgb555_rb
    322 	packssdw mm1,mm3
    323 
    324 	movq mm3,mm5
    325 	pmaddwd mm0,mm7
    326 
    327 	pand_immq mm3, mmx32_rgb555_rb
    328 	pand mm4,mm6
    329 
    330 	movq [edi],mm1			
    331 	pmaddwd mm3,mm7
    332 
    333         add esi,BYTE 32
    334 	por mm4,mm0
    335 
    336 	pand mm5,mm6
    337 	psrld mm4,6
    338 
    339 	movq mm2,[esi+8]
    340 	por mm5,mm3
    341 
    342 	movq mm0,[esi]
    343 	psrld mm5,6
    344 
    345 	movq mm3,mm2
    346 	movq mm1,mm0
    347 
    348 	pand_immq mm3, mmx32_rgb555_rb
    349 	packssdw mm5,mm4
    350 
    351 	pand_immq mm1, mmx32_rgb555_rb
    352 	pand mm2,mm6
    353 
    354 	CLEANUP_IMMQ_LOADS(4)
    355 
    356 	movq [edi+8],mm5
    357 	pmaddwd mm3,mm7
    358 
    359 	pmaddwd mm1,mm7
    360         add edi,BYTE 16
    361 	
    362         sub ecx,BYTE 8
    363 	jz .L2
    364         jmp .L1
    365 
    366 
    367 .L2:	
    368 	mov ecx,edx
    369 	
    370         and ecx,BYTE 7
    371 	jz .L4
    372 	
    373 .L3:	
    374 	mov ebx,[esi]
    375         add esi,BYTE 4
    376 	
    377         mov eax,ebx
    378         mov edx,ebx
    379 
    380         shr eax,3
    381         shr edx,6
    382 
    383         and eax,BYTE 0000000000011111b
    384         and edx,     0000001111100000b
    385 
    386         shr ebx,9
    387 
    388         or eax,edx
    389 
    390         and ebx,     0111110000000000b
    391 
    392         or eax,ebx
    393 
    394         mov [edi],ax
    395         add edi,BYTE 2
    396 
    397 	dec ecx
    398 	jnz .L3	
    399 
    400 .L4:		
    401 	retn
    402 
    403 %ifidn __OUTPUT_FORMAT__,elf32
    404 section .note.GNU-stack noalloc noexec nowrite progbits
    405 %endif
    406