Home | History | Annotate | Download | only in x86_64
      1 ;
      2 ; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
      3 ;
      4 ; Copyright (C) 2011, 2016, D. R. Commander.
      5 ;
      6 ; Based on the x86 SIMD extension for IJG JPEG library
      7 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      8 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
      9 ;
     10 ; This file should be assembled with NASM (Netwide Assembler); it
     11 ; can *not* be assembled with Microsoft's MASM or any compatible
     12 ; assembler (including Borland's Turbo Assembler).
     13 ; NASM is available from http://nasm.sourceforge.net/ or
     14 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     15 ;
     16 ; [TAB8]
     17 
     18 %include "jcolsamp.inc"
     19 
     20 ; --------------------------------------------------------------------------
     21 ;
     22 ; Convert some rows of samples to the output colorspace.
     23 ;
     24 ; GLOBAL(void)
     25 ; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
     26 ;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
     27 ;                             int num_rows);
     28 ;
        ; For each of num_rows rows, reads img_width interleaved pixels of
        ; RGB_PIXELSIZE (3 or 4) bytes from the next input_buf row and writes one
        ; luminance (Y) byte per pixel to the matching row of output_buf[0],
        ; starting at row output_row.  SIZEOF_XMMWORD pixels are converted per
        ; pass of the column loop.
        ;
        ; Properties that follow from the code below:
        ;   * Y is stored with movdqa, so every output row pointer must be
        ;     aligned to SIZEOF_XMMWORD.
        ;   * A whole xmmword is stored even for the last, partial group of a
        ;     row, so each output row needs room for img_width rounded up to a
        ;     multiple of SIZEOF_XMMWORD; the bytes past img_width are
        ;     unspecified.
        ;   * The partial group is loaded piecewise (.column_ld*), so the input
        ;     row is not read past img_width * RGB_PIXELSIZE bytes.
        ;   * If img_width == 0 or num_rows <= 0, no samples are read or written.
     29 
     30 ; r10d = JDIMENSION img_width
     31 ; r11 = JSAMPARRAY input_buf
     32 ; r12 = JSAMPIMAGE output_buf
     33 ; r13d = JDIMENSION output_row
     34 ; r14d = int num_rows
        ;
        ; Working registers inside the function:
        ;   rcx = columns (pixels) still to convert in the current row
        ;   rax = rows still to convert
        ;   rsi = cursor into input_buf[]; inside a row, the input byte pointer
        ;   rdi = cursor into output_buf[0][]; inside a row, the output pointer
        ;   wk(0), wk(1) = two aligned xmmword spill slots just below rbp
     35 
     36 %define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
     37 %define WK_NUM  2
     38 
     39     align       32
     40     GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
     41 
     42 EXTN(jsimd_rgb_gray_convert_sse2):
            ; Prologue: build a 16-byte-aligned frame.  The pre-alignment rsp is
            ; parked at [rbp] so the epilogue can undo the alignment, and the
            ; wk[] scratch array is reserved immediately below rbp.
     43     push        rbp
     44     mov         rax, rsp                     ; rax = rsp after the push (-> saved rbp)
     45     sub         rsp, byte 4
     46     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
     47     mov         [rsp], rax                   ; remember the pre-alignment rsp
     48     mov         rbp, rsp                     ; rbp = aligned rbp
     49     lea         rsp, [wk(0)]                 ; reserve wk[WK_NUM] below rbp
     50     collect_args 5                           ; macro defined outside this file; per the
                                                     ; list above, args end up in r10-r14
     51     push        rbx                          ; saved here, restored at .return
     52 
     53     mov         ecx, r10d               ; rcx = img_width (zero-extended)
     54     test        rcx, rcx
     55     jz          near .return            ; zero-width image: nothing to do
     56 
     57     push        rcx                     ; rcx is borrowed for output_row below
     58 
     59     mov         rsi, r12
     60     mov         ecx, r13d
     61     mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]  ; rdi = output_buf[0]
     62     lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]             ; rdi = &output_buf[0][output_row]
     63 
     64     pop         rcx                     ; rcx = img_width again
     65 
     66     mov         rsi, r11                ; rsi = input_buf
     67     mov         eax, r14d               ; rax = num_rows (upper 32 bits zeroed)
            ; num_rows is a signed int.  The test must be done on eax: after the
            ; 32-bit mov above, rax is zero-extended and would never look negative,
            ; so a 64-bit test would let a negative count into the row loop.
     68     test        eax, eax
     69     jle         near .return            ; num_rows <= 0: nothing to do
     70 .rowloop:
     71     push        rdi
     72     push        rsi
     73     push        rcx                     ; col
     74 
     75     mov         rsi, JSAMPROW [rsi]     ; inptr
     76     mov         rdi, JSAMPROW [rdi]     ; outptr0
     77 
     78     cmp         rcx, byte SIZEOF_XMMWORD
     79     jae         near .columnloop        ; at least one full group of pixels
     80 
     81 %if RGB_PIXELSIZE == 3  ; ---------------
     82 
            ; Partial group (fewer than SIZEOF_XMMWORD pixels left).  The pixel
            ; count is turned into a byte count and consumed one set bit at a
            ; time, highest addresses first: 1, 2, 4, 8, 16 and 32 bytes.  Each
            ; step shifts what has been gathered so far towards the high end and
            ; merges the new bytes underneath, so the data ends up laid out as in
            ; a full load.  Lanes that are never loaded keep stale contents; they
            ; only feed output bytes beyond img_width.
     83 .column_ld1:
     84     push        rax                     ; rax/rdx are scratch for the 1- and
     85     push        rdx                     ; 2-byte pieces; restored in .column_ld4
     86     lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
     87     test        cl, SIZEOF_BYTE
     88     jz          short .column_ld2
     89     sub         rcx, byte SIZEOF_BYTE
     90     movzx       rax, BYTE [rsi+rcx]
     91 .column_ld2:
     92     test        cl, SIZEOF_WORD
     93     jz          short .column_ld4
     94     sub         rcx, byte SIZEOF_WORD
     95     movzx       rdx, WORD [rsi+rcx]
     96     shl         rax, WORD_BIT
     97     or          rax, rdx
     98 .column_ld4:
     99     movd        xmmA, eax               ; seed xmmA with the 0-3 bytes gathered so far
    100     pop         rdx
    101     pop         rax
    102     test        cl, SIZEOF_DWORD
    103     jz          short .column_ld8
    104     sub         rcx, byte SIZEOF_DWORD
    105     movd        xmmF, XMM_DWORD [rsi+rcx]
    106     pslldq      xmmA, SIZEOF_DWORD
    107     por         xmmA, xmmF
    108 .column_ld8:
    109     test        cl, SIZEOF_MMWORD
    110     jz          short .column_ld16
    111     sub         rcx, byte SIZEOF_MMWORD
    112     movq        xmmB, XMM_MMWORD [rsi+rcx]
    113     pslldq      xmmA, SIZEOF_MMWORD
    114     por         xmmA, xmmB
    115 .column_ld16:
    116     test        cl, SIZEOF_XMMWORD
    117     jz          short .column_ld32
    118     movdqa      xmmF, xmmA              ; gathered bytes become the 2nd xmmword
    119     movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    120     mov         rcx, SIZEOF_XMMWORD     ; treat as one full group so the loop ends
    121     jmp         short .rgb_gray_cnv
    122 .column_ld32:
    123     test        cl, 2*SIZEOF_XMMWORD
    124     mov         rcx, SIZEOF_XMMWORD     ; (mov leaves the flags from test intact)
    125     jz          short .rgb_gray_cnv
    126     movdqa      xmmB, xmmA              ; gathered bytes become the 3rd xmmword
    127     movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    128     movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    129     jmp         short .rgb_gray_cnv
    130 
            ; Full group: three unaligned xmmword loads of interleaved pixel data.
    131 .columnloop:
    132     movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    133     movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    134     movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    135 
            ; De-interleave.  In the lane diagrams each entry is <component><pixel>:
            ; the first digit is the byte's position within its pixel (0..2) and
            ; the second is the pixel index within the group (0..F).
    136 .rgb_gray_cnv:
    137     ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    138     ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    139     ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
    140 
    141     movdqa      xmmG, xmmA
    142     pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    143     psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
    144 
    145     punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    146     pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
    147 
    148     punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    149     punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
    150 
    151     movdqa      xmmD, xmmA
    152     pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    153     psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
    154 
    155     punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    156     pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
    157 
    158     punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    159     punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
    160 
    161     movdqa      xmmE, xmmA
    162     pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    163     psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
    164 
    165     punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    166     pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
    167 
    168     punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    169     punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
    170 
    171     pxor        xmmH, xmmH  ; zero, used to widen bytes to words below
    172 
    173     movdqa      xmmC, xmmA
    174     punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
    175     punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
    176 
    177     movdqa      xmmB, xmmE
    178     punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
    179     punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
    180 
    181     movdqa      xmmF, xmmD
    182     punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
    183     punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
    184 
    185 %else  ; RGB_PIXELSIZE == 4 ; -----------
    186 
            ; Partial group (fewer than SIZEOF_XMMWORD pixels left).  rcx stays a
            ; pixel count here; its set bits select loads of 1, 2, 4 and 8 pixels
            ; (4, 8, 16 and 32 bytes), highest addresses first.  Lanes that are
            ; never loaded keep stale contents; they only feed output bytes beyond
            ; img_width.
    187 .column_ld1:
    188     test        cl, SIZEOF_XMMWORD/16
    189     jz          short .column_ld2
    190     sub         rcx, byte SIZEOF_XMMWORD/16
    191     movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
    192 .column_ld2:
    193     test        cl, SIZEOF_XMMWORD/8
    194     jz          short .column_ld4
    195     sub         rcx, byte SIZEOF_XMMWORD/8
    196     movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    197     pslldq      xmmA, SIZEOF_MMWORD
    198     por         xmmA, xmmE
    199 .column_ld4:
    200     test        cl, SIZEOF_XMMWORD/4
    201     jz          short .column_ld8
    202     sub         rcx, byte SIZEOF_XMMWORD/4
    203     movdqa      xmmE, xmmA              ; gathered bytes become the 2nd xmmword
    204     movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
    205 .column_ld8:
    206     test        cl, SIZEOF_XMMWORD/2
    207     mov         rcx, SIZEOF_XMMWORD     ; one full group, so the loop ends after
                                                ; this pass (mov leaves the flags intact)
    208     jz          short .rgb_gray_cnv
    209     movdqa      xmmF, xmmA              ; gathered xmmwords move up to slots 3 and 4
    210     movdqa      xmmH, xmmE
    211     movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    212     movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    213     jmp         short .rgb_gray_cnv
    214 
            ; Full group: four unaligned xmmword loads of interleaved pixel data.
    215 .columnloop:
    216     movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    217     movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    218     movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    219     movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
    220 
            ; De-interleave.  In the lane diagrams each entry is <component><pixel>:
            ; the first digit is the byte's position within its pixel (0..3) and
            ; the second is the pixel index within the group (0..F).
    221 .rgb_gray_cnv:
    222     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    223     ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    224     ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    225     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    226 
    227     movdqa      xmmD, xmmA
    228     punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    229     punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
    230 
    231     movdqa      xmmC, xmmF
    232     punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    233     punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
    234 
    235     movdqa      xmmB, xmmA
    236     punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    237     punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
    238 
    239     movdqa      xmmG, xmmD
    240     punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    241     punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
    242 
    243     movdqa      xmmE, xmmA
    244     punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    245     punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
    246 
    247     movdqa      xmmH, xmmB
    248     punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    249     punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
    250 
    251     pxor        xmmF, xmmF      ; zero, used to widen bytes to words below
    252 
    253     movdqa      xmmC, xmmA
    254     punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
    255     punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
    256 
    257     movdqa      xmmD, xmmB
    258     punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
    259     punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
    260 
    261     movdqa      xmmG, xmmE
    262     punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
    263     punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
    264 
            ; xmmF (the zero register) is itself a destination here: unpack xmmH's
            ; bytes into the high byte of each word, then shift them down to
            ; zero-extend.
    265     punpcklbw   xmmF, xmmH
    266     punpckhbw   xmmH, xmmH
    267     psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
    268     psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
    269 
    270 %endif  ; RGB_PIXELSIZE ; ---------------
    271 
            ; From here on the code uses physical register names.  Which of
            ; xmmA..xmmH is which of xmm0..xmm7 is set by macros outside this file
            ; (presumably per RGB byte order -- confirm in jcolsamp.inc).  Each
            ; register holds eight 16-bit samples: E = even pixels, O = odd pixels,
            ; L/H = low/high four of those eight.
    272     ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    273     ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
    274 
    275     ; (Original)
    276     ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    277     ;
    278     ; (This implementation)
    279     ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
            ;
            ; The G weight is split across two pmaddwd word pairs, (R,G) and (B,G)
            ; -- presumably so every fixed-point coefficient fits pmaddwd's signed
            ; 16-bit operands; the PW_* constants are defined outside this file.
    280 
            ; Odd pixels: R/G partial sums, kept in xmm1 (low) and xmm7 (high).
    281     movdqa      xmm6, xmm1
    282     punpcklwd   xmm1, xmm3
    283     punpckhwd   xmm6, xmm3
    284     pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    285     pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    286 
    287     movdqa      xmm7, xmm6              ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
    288 
            ; Even pixels: R/G partial sums, spilled to wk(0)/wk(1) to free registers.
    289     movdqa      xmm6, xmm0
    290     punpcklwd   xmm0, xmm2
    291     punpckhwd   xmm6, xmm2
    292     pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    293     pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
    294 
    295     movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
    296     movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
    297 
    298     movdqa      xmm0, xmm5              ; xmm0=BO
    299     movdqa      xmm6, xmm4              ; xmm6=BE
    300 
            ; Odd pixels: B/G partial sums, then add the R/G sums, round and scale.
    301     movdqa      xmm4, xmm0
    302     punpcklwd   xmm0, xmm3
    303     punpckhwd   xmm4, xmm3
    304     pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    305     pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    306 
    307     movdqa      xmm3, [rel PD_ONEHALF]      ; xmm3=[PD_ONEHALF]
    308 
    309     paddd       xmm0, xmm1
    310     paddd       xmm4, xmm7
    311     paddd       xmm0, xmm3              ; add the rounding term before the shift
    312     paddd       xmm4, xmm3
    313     psrld       xmm0, SCALEBITS         ; xmm0=YOL
    314     psrld       xmm4, SCALEBITS         ; xmm4=YOH
    315     packssdw    xmm0, xmm4              ; xmm0=YO
    316 
            ; Even pixels: same again, using the sums spilled to wk(0)/wk(1).
    317     movdqa      xmm4, xmm6
    318     punpcklwd   xmm6, xmm2
    319     punpckhwd   xmm4, xmm2
    320     pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    321     pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    322 
    323     movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]
    324 
    325     paddd       xmm6, XMMWORD [wk(0)]
    326     paddd       xmm4, XMMWORD [wk(1)]
    327     paddd       xmm6, xmm2              ; add the rounding term before the shift
    328     paddd       xmm4, xmm2
    329     psrld       xmm6, SCALEBITS         ; xmm6=YEL
    330     psrld       xmm4, SCALEBITS         ; xmm4=YEH
    331     packssdw    xmm6, xmm4              ; xmm6=YE
    332 
            ; Re-interleave: odd-pixel Y goes to the high byte of each word and
            ; even-pixel Y stays in the low byte, giving 16 bytes in pixel order.
    333     psllw       xmm0, BYTE_BIT
    334     por         xmm6, xmm0              ; xmm6=Y
    335     movdqa      XMMWORD [rdi], xmm6     ; Save Y
    336 
            ; Advance by one group.  A partial group set rcx = SIZEOF_XMMWORD
            ; beforehand, so rcx reaches 0 here and the row is finished.
    337     sub         rcx, byte SIZEOF_XMMWORD
    338     add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
    339     add         rdi, byte SIZEOF_XMMWORD                ; outptr0
    340     cmp         rcx, byte SIZEOF_XMMWORD
    341     jae         near .columnloop        ; another full group remains
    342     test        rcx, rcx
    343     jnz         near .column_ld1        ; 1..SIZEOF_XMMWORD-1 pixels remain
    344 
    345     pop         rcx                     ; col
    346     pop         rsi
    347     pop         rdi
    348 
    349     add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    350     add         rdi, byte SIZEOF_JSAMPROW  ; next output row pointer
    351     dec         rax                        ; num_rows
    352     jg          near .rowloop
    353 
            ; Epilogue: mirror of the prologue.  [rbp] holds the pre-alignment rsp
            ; stored on entry; popping it into rsp undoes the alignment.
    354 .return:
    355     pop         rbx
    356     uncollect_args 5
    357     mov         rsp, rbp                ; rsp <- aligned rbp
    358     pop         rsp                     ; rsp <- pre-alignment rsp (-> saved rbp)
    359     pop         rbp
    360     ret
    361 
    362 ; For some reason, the OS X linker does not honor the request to align the
    363 ; segment unless we do this.
        ; (The directive below pads the current position up to a 32-byte boundary.)
    364     align       32
    365