;
; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; Copyright (C) 2011, D. R. Commander.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
;                              JDIMENSION output_row, int num_rows);
;
; Every row of packed RGB_PIXELSIZE-byte pixels addressed through input_buf
; is converted to one row of luminance (Y) samples, written through
; output_buf[0] starting at row index output_row.
;
; Nothing is done when img_width is zero or when num_rows is <= 0.
;
; Pixels are converted SIZEOF_XMMWORD at a time.  A final group of fewer
; pixels is gathered with a sequence of partial loads (.column_ld*), but the
; result is still written with one full movdqa store.  Each output row must
; therefore be XMMWORD-aligned and have room for img_width rounded up to a
; multiple of SIZEOF_XMMWORD samples.
;
; xmmA..xmmH, collect_args/uncollect_args, EXTN and the SIZEOF_*/PW_*/PD_*
; symbols are not defined in this file; they are expected to come from the
; included headers.
;

; Register assignment after collect_args:
;
; r10 = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13 = JDIMENSION output_row
; r14 = int num_rows

; Local workspace: WK_NUM xmmwords placed directly below the 16-byte aligned
; frame pointer, so [wk(i)] may be accessed with movdqa.
%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          2

        align   16

        global  EXTN(jsimd_rgb_gray_convert_sse2)

EXTN(jsimd_rgb_gray_convert_sse2):
        ; Build an aligned frame: the pre-alignment rsp is stored at [rbp]
        ; so that the epilogue can restore it with "pop rsp".
        push    rbp
        mov     rax,rsp                         ; rax = rsp after the push (-> saved rbp)
        sub     rsp, byte 4
        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [rsp],rax
        mov     rbp,rsp                         ; rbp = aligned frame pointer
        lea     rsp, [wk(0)]                    ; reserve wk[WK_NUM] below rbp
        collect_args
        push    rbx

        mov     ecx, r10d                       ; rcx = img_width (zero-extended)
        test    rcx,rcx
        jz      near .return

        push    rcx                             ; rcx is borrowed for output_row

        mov     rsi, r12
        mov     ecx, r13d
        mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
        lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; rdi = &output_buf[0][output_row]

        pop     rcx                             ; rcx = img_width again

        mov     rsi, r11                        ; rsi = input_buf
        mov     eax, r14d                       ; rax = num_rows, upper 32 bits zeroed
        ; num_rows is a signed int.  Because the load above zero-extends it,
        ; the sign must be tested on eax; testing rax would let a negative
        ; count through as a huge positive one.
        test    eax,eax
        jle     near .return
.rowloop:
        ; rsi -> current input row pointer, rdi -> current output row pointer,
        ; rcx = img_width, rax = rows remaining.  All three of rdi/rsi/rcx are
        ; reused inside the column loop and restored at the end of the row.
        push    rdi
        push    rsi
        push    rcx                     ; col

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr0

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop

%if RGB_PIXELSIZE == 3 ; ---------------

        ; Tail handling (fewer than SIZEOF_XMMWORD pixels left).
        ; rcx becomes the number of remaining input BYTES.  Each set bit of
        ; that count selects one load of the matching size, taken from the
        ; end of the data; what was gathered before is shifted up first so
        ; that the bytes end up in the same order as a full-width load.
        ; Lanes beyond the valid data hold unspecified values.
.column_ld1:
        push    rax                     ; rax (row counter) and rdx are used
        push    rdx                     ;  as scratch for the 1- and 2-byte loads
        lea     rcx,[rcx+rcx*2]         ; imul ecx,RGB_PIXELSIZE
        test    cl, SIZEOF_BYTE
        jz      short .column_ld2
        sub     rcx, byte SIZEOF_BYTE
        movzx   rax, BYTE [rsi+rcx]
.column_ld2:
        test    cl, SIZEOF_WORD
        jz      short .column_ld4
        sub     rcx, byte SIZEOF_WORD
        movzx   rdx, WORD [rsi+rcx]
        shl     rax, WORD_BIT
        or      rax,rdx
.column_ld4:
        movd    xmmA,eax
        pop     rdx
        pop     rax
        test    cl, SIZEOF_DWORD
        jz      short .column_ld8
        sub     rcx, byte SIZEOF_DWORD
        movd    xmmF, XMM_DWORD [rsi+rcx]
        pslldq  xmmA, SIZEOF_DWORD
        por     xmmA,xmmF
.column_ld8:
        test    cl, SIZEOF_MMWORD
        jz      short .column_ld16
        sub     rcx, byte SIZEOF_MMWORD
        movq    xmmB, XMM_MMWORD [rsi+rcx]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmB
.column_ld16:
        test    cl, SIZEOF_XMMWORD
        jz      short .column_ld32
        movdqa  xmmF,xmmA
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        mov     rcx, SIZEOF_XMMWORD     ; makes this the last pass of the row
        jmp     short .rgb_gray_cnv
.column_ld32:
        test    cl, 2*SIZEOF_XMMWORD
        mov     rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
        jz      short .rgb_gray_cnv
        movdqa  xmmB,xmmA
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_gray_cnv

.columnloop:
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu  xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_gray_cnv:
        ; De-interleave the packed pixels into per-component vectors.
        ; Notation: first digit = byte index within the pixel,
        ;           second digit = pixel number (hex).
        ;
        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

        movdqa    xmmG,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

        movdqa    xmmD,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

        movdqa    xmmE,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

        ; Widen every component from bytes to words by unpacking with zero.
        pxor      xmmH,xmmH

        movdqa    xmmC,xmmA
        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa    xmmB,xmmE
        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)

        movdqa    xmmF,xmmD
        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else ; RGB_PIXELSIZE == 4 ; -----------

        ; Tail handling (fewer than SIZEOF_XMMWORD pixels left).
        ; rcx stays a PIXEL count here.  Each set bit selects one load
        ; (1, 2, 4 or 8 pixels) taken from the end of the row; lanes beyond
        ; the valid data hold unspecified values.
.column_ld1:
        test    cl, SIZEOF_XMMWORD/16
        jz      short .column_ld2
        sub     rcx, byte SIZEOF_XMMWORD/16
        movd    xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
        test    cl, SIZEOF_XMMWORD/8
        jz      short .column_ld4
        sub     rcx, byte SIZEOF_XMMWORD/8
        movq    xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmE
.column_ld4:
        test    cl, SIZEOF_XMMWORD/4
        jz      short .column_ld8
        sub     rcx, byte SIZEOF_XMMWORD/4
        movdqa  xmmE,xmmA
        movdqu  xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
        test    cl, SIZEOF_XMMWORD/2
        mov     rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact;
                                        ;  makes this the last pass of the row
        jz      short .rgb_gray_cnv
        movdqa  xmmF,xmmA
        movdqa  xmmH,xmmE
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_gray_cnv

.columnloop:
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
        movdqu  xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_gray_cnv:
        ; De-interleave the packed pixels into per-component vectors.
        ; Notation: first digit = byte index within the pixel,
        ;           second digit = pixel number (hex).
        ;
        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

        movdqa    xmmD,xmmA
        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

        movdqa    xmmC,xmmF
        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

        movdqa    xmmB,xmmA
        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

        movdqa    xmmG,xmmD
        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

        movdqa    xmmE,xmmA
        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

        movdqa    xmmH,xmmB
        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

        ; Widen every component from bytes to words.
        pxor      xmmF,xmmF

        movdqa    xmmC,xmmA
        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa    xmmD,xmmB
        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)

        movdqa    xmmG,xmmE
        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)

        ; No zero register is left for the last pair: unpack so that the
        ; wanted byte lands in the high half of each word, then shift down.
        punpcklbw xmmF,xmmH
        punpckhbw xmmH,xmmH
        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif ; RGB_PIXELSIZE ; ---------------

        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

        ; (Original)
        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
        ;
        ; (This implementation)
        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
        ;
        ; The G term is split in two so that each pmaddwd handles one
        ; (R,G) or (B,G) word pair.

        movdqa    xmm6,xmm1
        punpcklwd xmm1,xmm3
        punpckhwd xmm6,xmm3
        pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)

        movdqa    xmm7, xmm6    ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)

        movdqa    xmm6,xmm0
        punpcklwd xmm0,xmm2
        punpckhwd xmm6,xmm2
        pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)

        ; Spill the even-pixel partial sums to the aligned workspace.
        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
        movdqa    XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

        movdqa    xmm0, xmm5    ; xmm0=BO
        movdqa    xmm6, xmm4    ; xmm6=BE

        movdqa    xmm4,xmm0
        punpcklwd xmm0,xmm3
        punpckhwd xmm4,xmm3
        pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)

        movdqa    xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]

        paddd     xmm0, xmm1
        paddd     xmm4, xmm7
        paddd     xmm0,xmm3
        paddd     xmm4,xmm3
        psrld     xmm0,SCALEBITS        ; xmm0=YOL
        psrld     xmm4,SCALEBITS        ; xmm4=YOH
        packssdw  xmm0,xmm4             ; xmm0=YO

        movdqa    xmm4,xmm6
        punpcklwd xmm6,xmm2
        punpckhwd xmm4,xmm2
        pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)

        movdqa    xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]

        paddd     xmm6, XMMWORD [wk(0)]
        paddd     xmm4, XMMWORD [wk(1)]
        paddd     xmm6,xmm2
        paddd     xmm4,xmm2
        psrld     xmm6,SCALEBITS        ; xmm6=YEL
        psrld     xmm4,SCALEBITS        ; xmm4=YEH
        packssdw  xmm6,xmm4             ; xmm6=YE

        ; Re-interleave: odd-pixel Y goes into the high byte of each word,
        ; even-pixel Y stays in the low byte.
        psllw     xmm0,BYTE_BIT
        por       xmm6,xmm0             ; xmm6=Y
        movdqa    XMMWORD [rdi], xmm6   ; Save Y (always a full, aligned store)

        sub     rcx, byte SIZEOF_XMMWORD
        add     rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
        add     rdi, byte SIZEOF_XMMWORD                ; outptr0
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop        ; another full group remains
        test    rcx,rcx
        jnz     near .column_ld1        ; 1..SIZEOF_XMMWORD-1 pixels remain

        pop     rcx                     ; col
        pop     rsi
        pop     rdi

        add     rsi, byte SIZEOF_JSAMPROW       ; input_buf
        add     rdi, byte SIZEOF_JSAMPROW
        dec     rax                             ; num_rows
        jg      near .rowloop

.return:
        pop     rbx
        uncollect_args
        mov     rsp,rbp         ; rsp <- aligned frame pointer
        pop     rsp             ; rsp <- value saved in the prologue (-> saved rbp)
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16