Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jcgryext.asm - grayscale colorspace conversion (MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2011, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler),
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jcolsamp.inc"
     20 
     21 ; --------------------------------------------------------------------------
     22 ;
     23 ; Convert some rows of samples to the output colorspace.
     24 ;
     25 ; GLOBAL(void)
     26 ; jsimd_rgb_gray_convert_mmx (JDIMENSION img_width,
     27 ;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
     28 ;                             JDIMENSION output_row, int num_rows);
     29 ;
     30 
     31 %define img_width(b)    (b)+8           ; JDIMENSION img_width
     32 %define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf
     33 %define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf
     34 %define output_row(b)   (b)+20          ; JDIMENSION output_row
     35 %define num_rows(b)     (b)+24          ; int num_rows
        ; The five macros above give the address of each stack argument relative
        ; to a base register (b).  The routine passes eax, which it loads with
        ; esp immediately after "push ebp"; hence (b)+0 is the saved ebp, (b)+4
        ; the return address, and the first argument sits at (b)+8.
     36 
        ; Locals are addressed from the re-aligned ebp set up in the prologue:
        ; [ebp+0] receives the pre-alignment stack pointer, the wk[] scratch
        ; mmwords lie directly below ebp, and the GOT-pointer slot lies just
        ; below wk(0).  wk(i) mentions WK_NUM before WK_NUM is defined; that is
        ; fine because a %define body is expanded where the macro is used, not
        ; where it is defined.  original_ebp is not referenced by the code
        ; visible in this file.
     37 %define original_ebp    ebp+0
     38 %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
     39 %define WK_NUM          2
     40 %define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
     41 
     42         align   16
     43         global  EXTN(jsimd_rgb_gray_convert_mmx)
     44 
        ; Converts num_rows rows of packed RGB pixels (RGB_PIXELSIZE bytes each)
        ; to one luminance byte per pixel, written to the first component of
        ; output_buf starting at row output_row.
        ;
        ; Register roles once the prologue is done:
        ;   ecx = columns (pixels) still to convert in the current row
        ;   esi = input row-pointer array between rows, input pointer in a row
        ;   edi = output row-pointer array between rows, output pointer in a row
        ;   eax = rows left (between rows); inside a row it is loaded through
        ;         movpic from the gotptr slot and used as the GOTOFF() base
        ; ebp, ebx, esi and edi are saved and restored; ecx is clobbered.
        ;
        ; EXTN, alignx, get_GOT, pushpic/poppic/movpic, GOTOFF, the mmA..mmH
        ; register aliases, the PW_*/PD_* constants and the SIZEOF_*/..._BIT
        ; values are not defined in this file; presumably they arrive through
        ; jcolsamp.inc -- confirm.
     45 EXTN(jsimd_rgb_gray_convert_mmx):
     46         push    ebp
     47         mov     eax,esp                         ; eax = esp after the push: [eax]=caller's ebp
     48         sub     esp, byte 4                     ; keep the slot used below clear of the saved ebp
     49         and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
     50         mov     [esp],eax                       ; remember the unaligned stack pointer
     51         mov     ebp,esp                         ; ebp = aligned ebp
     52         lea     esp, [wk(0)]                    ; reserve the wk[] scratch area below ebp
     53         pushpic eax             ; make a room for GOT address
     54         push    ebx
     55 ;       push    ecx             ; need not be preserved
     56 ;       push    edx             ; need not be preserved
     57         push    esi
     58         push    edi
     59 
     60         get_GOT ebx                     ; get GOT address
     61         movpic  POINTER [gotptr], ebx   ; save GOT address
     62 
     63         mov     ecx, JDIMENSION [img_width(eax)]        ; num_cols
     64         test    ecx,ecx
     65         jz      near .return            ; zero-width image: nothing to do
     66 
        ; ecx is borrowed for output_row, so num_cols is parked on the stack.
     67         push    ecx
     68 
     69         mov     esi, JSAMPIMAGE [output_buf(eax)]
     70         mov     ecx, JDIMENSION [output_row(eax)]
     71         mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]      ; row array of component 0
     72         lea     edi, [edi+ecx*SIZEOF_JSAMPROW]                 ; edi = &output_buf[0][output_row]
     73 
     74         pop     ecx
     75 
     76         mov     esi, JSAMPARRAY [input_buf(eax)]
     77         mov     eax, INT [num_rows(eax)]        ; last use of eax as the argument base
     78         test    eax,eax
     79         jle     near .return            ; signed test: zero or negative row count
     80         alignx  16,7
     81 .rowloop:
        ; Save the per-row state; esi/edi/ecx are overwritten with the row's
        ; own pointers and column counter below, eax with the GOT address.
     82         pushpic eax
     83         push    edi
     84         push    esi
     85         push    ecx                     ; col
     86 
     87         mov     esi, JSAMPROW [esi]     ; inptr
     88         mov     edi, JSAMPROW [edi]     ; outptr0
     89         movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
     90 
        ; A full group is SIZEOF_MMWORD pixels.  With fewer than that in the
        ; whole row, fall through into the partial-group loader (.column_ld1).
     91         cmp     ecx, byte SIZEOF_MMWORD
     92         jae     short .columnloop
     93         alignx  16,7
     94 
     95 %if RGB_PIXELSIZE == 3 ; ---------------
     96 
        ; Partial group, 3-byte pixels.  Reached only with 0 < ecx <
        ; SIZEOF_MMWORD.  The remaining bytes are fetched in pieces selected by
        ; the bits of the byte count, working backwards from the end of the
        ; row, and assembled into the same mmA/mmG/mmF layout that .columnloop
        ; produces; the pieces add up to exactly the remaining byte count.
        ; Lanes that receive no data keep whatever was there before; each
        ; output byte depends only on its own pixel, so this only affects the
        ; columns past the end of the row.
     97 .column_ld1:
     98         push    eax                     ; eax/edx serve as scratch for the odd bytes;
     99         push    edx                     ; eax is needed again as the GOTOFF() base
    100         lea     ecx,[ecx+ecx*2]         ; imul ecx,RGB_PIXELSIZE
    101         test    cl, SIZEOF_BYTE
    102         jz      short .column_ld2
    103         sub     ecx, byte SIZEOF_BYTE
    104         xor     eax,eax
    105         mov     al, BYTE [esi+ecx]      ; last single byte
    106 .column_ld2:
    107         test    cl, SIZEOF_WORD
    108         jz      short .column_ld4
    109         sub     ecx, byte SIZEOF_WORD
    110         xor     edx,edx
    111         mov     dx, WORD [esi+ecx]
    112         shl     eax, WORD_BIT           ; earlier byte goes above the word,
    113         or      eax,edx                 ; matching its higher address
    114 .column_ld4:
    115         movd    mmA,eax
    116         pop     edx
    117         pop     eax
    118         test    cl, SIZEOF_DWORD
    119         jz      short .column_ld8
    120         sub     ecx, byte SIZEOF_DWORD
    121         movd    mmG, DWORD [esi+ecx]
    122         psllq   mmA, DWORD_BIT          ; gathered bytes follow the dword in memory
    123         por     mmA,mmG
    124 .column_ld8:
        ; One whole mmword precedes the gathered tail: the tail becomes the
        ; second mmword of the group and the first is read from the row start.
    125         test    cl, SIZEOF_MMWORD
    126         jz      short .column_ld16
    127         movq    mmG,mmA
    128         movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    129         mov     ecx, SIZEOF_MMWORD      ; make the loop bookkeeping see one last full group
    130         jmp     short .rgb_gray_cnv
    131 .column_ld16:
        ; Two whole mmwords precede the tail: the tail becomes the third
        ; mmword.  The mov between test and jz does not modify the flags.
    132         test    cl, 2*SIZEOF_MMWORD
    133         mov     ecx, SIZEOF_MMWORD      ; make the loop bookkeeping see one last full group
    134         jz      short .rgb_gray_cnv
    135         movq    mmF,mmA
    136         movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    137         movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
    138         jmp     short .rgb_gray_cnv
    139         alignx  16,7
    140 
        ; Full group: SIZEOF_MMWORD pixels of 3 bytes = three mmwords.
    141 .columnloop:
    142         movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    143         movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
    144         movq    mmF, MMWORD [esi+2*SIZEOF_MMWORD]
    145 
        ; De-interleave the packed pixels into per-channel vectors split by
        ; even/odd column, then zero-extend the bytes to words.
        ; Lane notation: each two-digit entry is <byte within pixel><pixel
        ; number>, listed from the lowest-addressed byte upward.
    146 .rgb_gray_cnv:
    147         ; mmA=(00 10 20 01 11 21 02 12)
    148         ; mmG=(22 03 13 23 04 14 24 05)
    149         ; mmF=(15 25 06 16 26 07 17 27)
    150 
    151         movq      mmD,mmA
    152         psllq     mmA,4*BYTE_BIT        ; mmA=(-- -- -- -- 00 10 20 01)
    153         psrlq     mmD,4*BYTE_BIT        ; mmD=(11 21 02 12 -- -- -- --)
    154 
    155         punpckhbw mmA,mmG               ; mmA=(00 04 10 14 20 24 01 05)
    156         psllq     mmG,4*BYTE_BIT        ; mmG=(-- -- -- -- 22 03 13 23)
    157 
    158         punpcklbw mmD,mmF               ; mmD=(11 15 21 25 02 06 12 16)
    159         punpckhbw mmG,mmF               ; mmG=(22 26 03 07 13 17 23 27)
    160 
    161         movq      mmE,mmA
    162         psllq     mmA,4*BYTE_BIT        ; mmA=(-- -- -- -- 00 04 10 14)
    163         psrlq     mmE,4*BYTE_BIT        ; mmE=(20 24 01 05 -- -- -- --)
    164 
    165         punpckhbw mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
    166         psllq     mmD,4*BYTE_BIT        ; mmD=(-- -- -- -- 11 15 21 25)
    167 
    168         punpcklbw mmE,mmG               ; mmE=(20 22 24 26 01 03 05 07)
    169         punpckhbw mmD,mmG               ; mmD=(11 13 15 17 21 23 25 27)
    170 
    171         pxor      mmH,mmH               ; zero source for the byte->word widening
    172 
    173         movq      mmC,mmA
    174         punpcklbw mmA,mmH               ; mmA=(00 02 04 06)
    175         punpckhbw mmC,mmH               ; mmC=(10 12 14 16)
    176 
    177         movq      mmB,mmE
    178         punpcklbw mmE,mmH               ; mmE=(20 22 24 26)
    179         punpckhbw mmB,mmH               ; mmB=(01 03 05 07)
    180 
    181         movq      mmF,mmD
    182         punpcklbw mmD,mmH               ; mmD=(11 13 15 17)
    183         punpckhbw mmF,mmH               ; mmF=(21 23 25 27)
    184 
    185 %else ; RGB_PIXELSIZE == 4 ; -----------
    186 
        ; Partial group, 4-byte pixels.  Reached only with 0 < ecx <
        ; SIZEOF_MMWORD.  ecx stays a pixel count here; its low bits select a
        ; 1-pixel (dword), 2-pixel (mmword) and 4-pixel (two mmwords) piece,
        ; taken backwards from the end of the row.  Registers that receive no
        ; piece keep their previous contents.
    187 .column_ld1:
    188         test    cl, SIZEOF_MMWORD/8
    189         jz      short .column_ld2
    190         sub     ecx, byte SIZEOF_MMWORD/8
    191         movd    mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
    192 .column_ld2:
    193         test    cl, SIZEOF_MMWORD/4
    194         jz      short .column_ld4
    195         sub     ecx, byte SIZEOF_MMWORD/4
    196         movq    mmF,mmA                 ; what was gathered so far follows this pair
    197         movq    mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
    198 .column_ld4:
        ; The mov between test and jz does not modify the flags.
    199         test    cl, SIZEOF_MMWORD/2
    200         mov     ecx, SIZEOF_MMWORD      ; make the loop bookkeeping see one last full group
    201         jz      short .rgb_gray_cnv
    202         movq    mmD,mmA                 ; gathered pixels become the 3rd/4th mmword;
    203         movq    mmC,mmF                 ; the first two come from the row start
    204         movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    205         movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
    206         jmp     short .rgb_gray_cnv
    207         alignx  16,7
    208 
        ; Full group: SIZEOF_MMWORD pixels of 4 bytes = four mmwords.
    209 .columnloop:
    210         movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    211         movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
    212         movq    mmD, MMWORD [esi+2*SIZEOF_MMWORD]
    213         movq    mmC, MMWORD [esi+3*SIZEOF_MMWORD]
    214 
        ; De-interleave the packed pixels into per-channel vectors split by
        ; even/odd column, then zero-extend the bytes to words.
        ; Lane notation: each two-digit entry is <byte within pixel><pixel
        ; number>, listed from the lowest-addressed byte upward.
    215 .rgb_gray_cnv:
    216         ; mmA=(00 10 20 30 01 11 21 31)
    217         ; mmF=(02 12 22 32 03 13 23 33)
    218         ; mmD=(04 14 24 34 05 15 25 35)
    219         ; mmC=(06 16 26 36 07 17 27 37)
    220 
    221         movq      mmB,mmA
    222         punpcklbw mmA,mmF               ; mmA=(00 02 10 12 20 22 30 32)
    223         punpckhbw mmB,mmF               ; mmB=(01 03 11 13 21 23 31 33)
    224 
    225         movq      mmG,mmD
    226         punpcklbw mmD,mmC               ; mmD=(04 06 14 16 24 26 34 36)
    227         punpckhbw mmG,mmC               ; mmG=(05 07 15 17 25 27 35 37)
    228 
    229         movq      mmE,mmA
    230         punpcklwd mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
    231         punpckhwd mmE,mmD               ; mmE=(20 22 24 26 30 32 34 36)
    232 
    233         movq      mmH,mmB
    234         punpcklwd mmB,mmG               ; mmB=(01 03 05 07 11 13 15 17)
    235         punpckhwd mmH,mmG               ; mmH=(21 23 25 27 31 33 35 37)
    236 
    237         pxor      mmF,mmF               ; zero source for the byte->word widening
    238 
    239         movq      mmC,mmA
    240         punpcklbw mmA,mmF               ; mmA=(00 02 04 06)
    241         punpckhbw mmC,mmF               ; mmC=(10 12 14 16)
    242 
    243         movq      mmD,mmB
    244         punpcklbw mmB,mmF               ; mmB=(01 03 05 07)
    245         punpckhbw mmD,mmF               ; mmD=(11 13 15 17)
    246 
    247         movq      mmG,mmE
    248         punpcklbw mmE,mmF               ; mmE=(20 22 24 26)
    249         punpckhbw mmG,mmF               ; mmG=(30 32 34 36)
    250 
        ; No spare zero register is left for the last pair, so the data bytes
        ; are placed in the high half of each word (low half zero for mmF, a
        ; duplicate for mmH) and shifted down, which also zero-extends them.
    251         punpcklbw mmF,mmH
    252         punpckhbw mmH,mmH
    253         psrlw     mmF,BYTE_BIT          ; mmF=(21 23 25 27)
    254         psrlw     mmH,BYTE_BIT          ; mmH=(31 33 35 37)
    255 
    256 %endif ; RGB_PIXELSIZE ; ---------------
    257 
        ; From here on the physical register names are used.  The mmA..mmH
        ; aliases above are presumably mapped (outside this file) so that the
        ; channels end up as listed on the next two lines -- confirm.
        ; E/O = even/odd columns; an L/H suffix in the comments below denotes
        ; the low/high two of the four word lanes.
    258         ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
    259         ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
    260 
    261         ; (Original)
    262         ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    263         ;
    264         ; (This implementation)
    265         ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    266 
        ; Each pmaddwd multiplies word pairs and adds the two products, so R and
        ; G (or B and G) words are interleaved first and one instruction yields
        ; a two-term partial sum per pixel as a dword.
    267         movq      mm6,mm1
    268         punpcklwd mm1,mm3
    269         punpckhwd mm6,mm3
    270         pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    271         pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    272 
    273         movq      mm7, mm6      ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
    274 
    275         movq      mm6,mm0
    276         punpcklwd mm0,mm2
    277         punpckhwd mm6,mm2
    278         pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
    279         pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
    280 
        ; All eight MMX registers are in use, so the even-column partial sums
        ; are spilled to the wk[] scratch slots until the odd columns are done.
    281         movq      MMWORD [wk(0)], mm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
    282         movq      MMWORD [wk(1)], mm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
    283 
    284         movq      mm0, mm5      ; mm0=BO
    285         movq      mm6, mm4      ; mm6=BE
    286 
    287         movq      mm4,mm0
    288         punpcklwd mm0,mm3
    289         punpckhwd mm4,mm3
    290         pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    291         pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    292 
    293         movq      mm3,[GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
    294 
        ; Odd columns: combine both partial sums, add the rounding term, drop
        ; the SCALEBITS fraction bits and narrow the four dwords to words.
    295         paddd     mm0, mm1
    296         paddd     mm4, mm7
    297         paddd     mm0,mm3
    298         paddd     mm4,mm3
    299         psrld     mm0,SCALEBITS         ; mm0=YOL
    300         psrld     mm4,SCALEBITS         ; mm4=YOH
    301         packssdw  mm0,mm4               ; mm0=YO
    302 
    303         movq      mm4,mm6
    304         punpcklwd mm6,mm2
    305         punpckhwd mm4,mm2
    306         pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    307         pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    308 
    309         movq      mm2,[GOTOFF(eax,PD_ONEHALF)]  ; mm2=[PD_ONEHALF]
    310 
        ; Even columns: same steps, with the R/G partial sums reloaded from wk[].
    311         paddd     mm6, MMWORD [wk(0)]
    312         paddd     mm4, MMWORD [wk(1)]
    313         paddd     mm6,mm2
    314         paddd     mm4,mm2
    315         psrld     mm6,SCALEBITS         ; mm6=YEL
    316         psrld     mm4,SCALEBITS         ; mm4=YEH
    317         packssdw  mm6,mm4               ; mm6=YE
    318 
        ; Re-interleave: odd results go to the high byte of each word and even
        ; results stay in the low byte, giving the output bytes in column order.
        ; NOTE(review): a whole mmword is stored even after a partial-group
        ; load, so the output row presumably has room past img_width -- confirm.
    319         psllw     mm0,BYTE_BIT
    320         por       mm6,mm0               ; mm6=Y
    321         movq      MMWORD [edi], mm6     ; Save Y
    322 
        ; Next group: loop on full groups, branch to the partial loader for a
        ; non-zero remainder, fall through when the row is finished.
    323         sub     ecx, byte SIZEOF_MMWORD
    324         add     esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; inptr
    325         add     edi, byte SIZEOF_MMWORD                 ; outptr0
    326         cmp     ecx, byte SIZEOF_MMWORD
    327         jae     near .columnloop
    328         test    ecx,ecx
    329         jnz     near .column_ld1
    330 
        ; Row done: restore the per-row state saved at .rowloop (reverse order)
        ; and step both row-pointer arrays to the next row.
    331         pop     ecx                     ; col
    332         pop     esi
    333         pop     edi
    334         poppic  eax
    335 
    336         add     esi, byte SIZEOF_JSAMPROW       ; input_buf
    337         add     edi, byte SIZEOF_JSAMPROW
    338         dec     eax                             ; num_rows
    339         jg      near .rowloop
    340 
    341         emms            ; empty MMX state
    342 
        ; Also the target of the two early exits above, which are taken before
        ; any MMX instruction has run and with the same three registers pushed.
    343 .return:
    344         pop     edi
    345         pop     esi
    346 ;       pop     edx             ; need not be preserved
    347 ;       pop     ecx             ; need not be preserved
    348         pop     ebx
    349         mov     esp,ebp         ; esp <- aligned ebp
    350         pop     esp             ; esp <- pre-alignment value stored at [ebp] in the prologue
    351         pop     ebp             ; restore the caller's ebp
    352         ret
    353 
    354 ; For some reason, the OS X linker does not honor the request to align the
    355 ; segment unless we do this.
        ; (The directive below pads the end of this file's output to a 16-byte
        ; boundary.)
    356         align   16
    357