Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jccolext.asm - colorspace conversion (64-bit SSE2)
      3 ;
      4 ; x86 SIMD extension for IJG JPEG library
      5 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      6 ; Copyright (C) 2009, D. R. Commander.
      7 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
      8 ;
      9 ; This file should be assembled with NASM (Netwide Assembler); it
     10 ; can *not* be assembled with Microsoft's MASM or any compatible
     11 ; assembler (including Borland's Turbo Assembler).
     12 ; NASM is available from http://nasm.sourceforge.net/ or
     13 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     14 ;
     15 ; [TAB8]
     16 
     17 %include "jcolsamp.inc"
     18 
     19 ; --------------------------------------------------------------------------
     20 ;
     21 ; Convert some rows of samples to the output colorspace.
     22 ;
     23 ; GLOBAL(void)
     24 ; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
     25 ;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
     26 ;                             JDIMENSION output_row, int num_rows);
     27 ;
     28 
     29 ; r10 = JDIMENSION img_width
     30 ; r11 = JSAMPARRAY input_buf
     31 ; r12 = JSAMPIMAGE output_buf
     32 ; r13 = JDIMENSION output_row
     33 ; r14 = int num_rows
     34 
     35 %define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; address of scratch slot i in xmmword wk[WK_NUM]; wk(0) is the lowest slot, wk(WK_NUM-1) sits just below rbp
     36 %define WK_NUM          8                               ; number of xmmword scratch slots (the routine below uses wk(0)..wk(7))
     37 
     38         align   16
     39 
     40         global  EXTN(jsimd_rgb_ycc_convert_sse2)
     41 ; Input rows are read with movdqu; output rows are written with movdqa and so must be xmmword-aligned.
     42 EXTN(jsimd_rgb_ycc_convert_sse2):
     43         push    rbp
     44         mov     rax,rsp                         ; rax = rsp right after saving the caller's rbp
     45         sub     rsp, byte 4                     ; step down so the AND below lands strictly under rax
     46         and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
     47         mov     [rsp],rax                       ; stash the pre-alignment rsp for the epilogue
     48         mov     rbp,rsp                         ; rbp = aligned rbp
     49         lea     rsp, [wk(0)]                    ; reserve the wk[] scratch area below rbp
     50         collect_args                            ; macro defined outside this file; the arguments are read from r10-r14 below
     51         push    rbx                             ; rbx is used as the Cb row pointer
     52 
     53         mov     ecx, r10d                       ; rcx = img_width (zero-extended)
     54         test    rcx,rcx
     55         jz      near .return                    ; zero-width image: nothing to convert
     56 
     57         push    rcx                             ; save img_width; rcx is borrowed for output_row
     58 
     59         mov rsi, r12                            ; rsi = output_buf
     60         mov ecx, r13d                           ; rcx = output_row
     61         mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]       ; component 0 row array (Y)
     62         mov     rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]       ; component 1 row array (Cb)
     63         mov     rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]       ; component 2 row array (Cr)
     64         lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; advance each row array to entry output_row
     65         lea     rbx, [rbx+rcx*SIZEOF_JSAMPROW]
     66         lea     rdx, [rdx+rcx*SIZEOF_JSAMPROW]
     67 
     68         pop     rcx                             ; rcx = img_width again
     69 
     70         mov rsi, r11                            ; rsi = input_buf
     71         mov     eax, r14d                       ; eax = num_rows (upper half of rax is cleared)
     72         test    eax,eax                         ; signed 32-bit test: rax is zero-extended, so "test rax,rax" could never see a negative count
     73         jle     near .return                    ; num_rows <= 0: nothing to do
     74 .rowloop:
     75         push    rdx                             ; save the row-array cursors; these registers become sample pointers
     76         push    rbx
     77         push    rdi
     78         push    rsi
     79         push    rcx                     ; col
     80 
     81         mov     rsi, JSAMPROW [rsi]     ; inptr
     82         mov     rdi, JSAMPROW [rdi]     ; outptr0
     83         mov     rbx, JSAMPROW [rbx]     ; outptr1
     84         mov     rdx, JSAMPROW [rdx]     ; outptr2
     85 
     86         cmp     rcx, byte SIZEOF_XMMWORD
     87         jae     near .columnloop                ; at least SIZEOF_XMMWORD pixels: take the full-width path
     88 
     89 %if RGB_PIXELSIZE == 3 ; ---------------
     90         ; Tail (col < SIZEOF_XMMWORD): gather the col*RGB_PIXELSIZE leftover bytes, one power-of-two piece per set bit of the byte count.
     91 .column_ld1:
     92         push    rax                             ; rax/rdx are borrowed as scratch; restored at .column_ld4
     93         push    rdx
     94         lea     rcx,[rcx+rcx*2]         ; imul ecx,RGB_PIXELSIZE
     95         test    cl, SIZEOF_BYTE
     96         jz      short .column_ld2
     97         sub     rcx, byte SIZEOF_BYTE
     98         movzx   rax, BYTE [rsi+rcx]
     99 .column_ld2:
    100         test    cl, SIZEOF_WORD
    101         jz      short .column_ld4
    102         sub     rcx, byte SIZEOF_WORD
    103         movzx   rdx, WORD [rsi+rcx]
    104         shl     rax, WORD_BIT
    105         or      rax,rdx
    106 .column_ld4:
    107         movd    xmmA,eax
    108         pop     rdx
    109         pop     rax
    110         test    cl, SIZEOF_DWORD
    111         jz      short .column_ld8
    112         sub     rcx, byte SIZEOF_DWORD
    113         movd    xmmF, XMM_DWORD [rsi+rcx]
    114         pslldq  xmmA, SIZEOF_DWORD
    115         por     xmmA,xmmF
    116 .column_ld8:
    117         test    cl, SIZEOF_MMWORD
    118         jz      short .column_ld16
    119         sub     rcx, byte SIZEOF_MMWORD
    120         movq    xmmB, XMM_MMWORD [rsi+rcx]
    121         pslldq  xmmA, SIZEOF_MMWORD
    122         por     xmmA,xmmB
    123 .column_ld16:
    124         test    cl, SIZEOF_XMMWORD
    125         jz      short .column_ld32
    126         movdqa  xmmF,xmmA
    127         movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    128         mov     rcx, SIZEOF_XMMWORD             ; count this as one full group so the loop bottom ends the row
    129         jmp     short .rgb_ycc_cnv
    130 .column_ld32:
    131         test    cl, 2*SIZEOF_XMMWORD
    132         mov     rcx, SIZEOF_XMMWORD             ; mov leaves the flags of the test above intact for the jz
    133         jz      short .rgb_ycc_cnv
    134         movdqa  xmmB,xmmA
    135         movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    136         movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    137         jmp     short .rgb_ycc_cnv
    138 
    139 .columnloop:
    140         movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    141         movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    142         movdqu  xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    143 
    144 .rgb_ycc_cnv:
    145         ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    146         ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    147         ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
    148 
    149         movdqa    xmmG,xmmA
    150         pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    151         psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
    152 
    153         punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    154         pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
    155 
    156         punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    157         punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
    158 
    159         movdqa    xmmD,xmmA
    160         pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    161         psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
    162 
    163         punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    164         pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
    165 
    166         punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    167         punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
    168 
    169         movdqa    xmmE,xmmA
    170         pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    171         psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
    172 
    173         punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    174         pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
    175 
    176         punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    177         punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
    178 
    179         pxor      xmmH,xmmH
    180 
    181         movdqa    xmmC,xmmA
    182         punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
    183         punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)
    184 
    185         movdqa    xmmB,xmmE
    186         punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
    187         punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)
    188 
    189         movdqa    xmmF,xmmD
    190         punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
    191         punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)
    192 
    193 %else ; RGB_PIXELSIZE == 4 ; -----------
    194         ; Tail (col < SIZEOF_XMMWORD): here rcx stays a pixel count; each set bit selects one load of that many pixels.
    195 .column_ld1:
    196         test    cl, SIZEOF_XMMWORD/16
    197         jz      short .column_ld2
    198         sub     rcx, byte SIZEOF_XMMWORD/16
    199         movd    xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
    200 .column_ld2:
    201         test    cl, SIZEOF_XMMWORD/8
    202         jz      short .column_ld4
    203         sub     rcx, byte SIZEOF_XMMWORD/8
    204         movq    xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    205         pslldq  xmmA, SIZEOF_MMWORD
    206         por     xmmA,xmmE
    207 .column_ld4:
    208         test    cl, SIZEOF_XMMWORD/4
    209         jz      short .column_ld8
    210         sub     rcx, byte SIZEOF_XMMWORD/4
    211         movdqa  xmmE,xmmA
    212         movdqu  xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
    213 .column_ld8:
    214         test    cl, SIZEOF_XMMWORD/2
    215         mov     rcx, SIZEOF_XMMWORD             ; mov leaves the flags of the test above intact for the jz
    216         jz      short .rgb_ycc_cnv
    217         movdqa  xmmF,xmmA
    218         movdqa  xmmH,xmmE
    219         movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    220         movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    221         jmp     short .rgb_ycc_cnv
    222 
    223 .columnloop:
    224         movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    225         movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    226         movdqu  xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    227         movdqu  xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
    228 
    229 .rgb_ycc_cnv:
    230         ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    231         ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    232         ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    233         ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    234 
    235         movdqa    xmmD,xmmA
    236         punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    237         punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
    238 
    239         movdqa    xmmC,xmmF
    240         punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    241         punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
    242 
    243         movdqa    xmmB,xmmA
    244         punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    245         punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
    246 
    247         movdqa    xmmG,xmmD
    248         punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    249         punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
    250 
    251         movdqa    xmmE,xmmA
    252         punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    253         punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
    254 
    255         movdqa    xmmH,xmmB
    256         punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    257         punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
    258 
    259         pxor      xmmF,xmmF
    260 
    261         movdqa    xmmC,xmmA
    262         punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
    263         punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)
    264 
    265         movdqa    xmmD,xmmB
    266         punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
    267         punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)
    268 
    269         movdqa    xmmG,xmmE
    270         punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
    271         punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)
    272 
    273         punpcklbw xmmF,xmmH
    274         punpckhbw xmmH,xmmH
    275         psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
    276         psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
    277 
    278 %endif ; RGB_PIXELSIZE ; ---------------
    279 
    280         ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    281         ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
    282 
    283         ; (Original)
    284         ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    285         ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    286         ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    287         ;
    288         ; (This implementation)
    289         ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    290         ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    291         ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    292 
    293         movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=RE
    294         movdqa    XMMWORD [wk(1)], xmm1 ; wk(1)=RO
    295         movdqa    XMMWORD [wk(2)], xmm4 ; wk(2)=BE
    296         movdqa    XMMWORD [wk(3)], xmm5 ; wk(3)=BO
    297 
    298         movdqa    xmm6,xmm1
    299         punpcklwd xmm1,xmm3
    300         punpckhwd xmm6,xmm3
    301         movdqa    xmm7,xmm1
    302         movdqa    xmm4,xmm6
    303         pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    304         pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    305         pmaddwd   xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
    306         pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
    307 
    308         movdqa    XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
    309         movdqa    XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
    310 
    311         pxor      xmm1,xmm1
    312         pxor      xmm6,xmm6
    313         punpcklwd xmm1,xmm5             ; xmm1=BOL
    314         punpckhwd xmm6,xmm5             ; xmm6=BOH
    315         psrld     xmm1,1                ; xmm1=BOL*FIX(0.500)
    316         psrld     xmm6,1                ; xmm6=BOH*FIX(0.500)
    317 
    318         movdqa    xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
    319 
    320         paddd     xmm7,xmm1
    321         paddd     xmm4,xmm6
    322         paddd     xmm7,xmm5
    323         paddd     xmm4,xmm5
    324         psrld     xmm7,SCALEBITS        ; xmm7=CbOL
    325         psrld     xmm4,SCALEBITS        ; xmm4=CbOH
    326         packssdw  xmm7,xmm4             ; xmm7=CbO
    327 
    328         movdqa    xmm1, XMMWORD [wk(2)] ; xmm1=BE
    329 
    330         movdqa    xmm6,xmm0
    331         punpcklwd xmm0,xmm2
    332         punpckhwd xmm6,xmm2
    333         movdqa    xmm5,xmm0
    334         movdqa    xmm4,xmm6
    335         pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    336         pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
    337         pmaddwd   xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
    338         pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
    339 
    340         movdqa    XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
    341         movdqa    XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
    342 
    343         pxor      xmm0,xmm0
    344         pxor      xmm6,xmm6
    345         punpcklwd xmm0,xmm1             ; xmm0=BEL
    346         punpckhwd xmm6,xmm1             ; xmm6=BEH
    347         psrld     xmm0,1                ; xmm0=BEL*FIX(0.500)
    348         psrld     xmm6,1                ; xmm6=BEH*FIX(0.500)
    349 
    350         movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
    351 
    352         paddd     xmm5,xmm0
    353         paddd     xmm4,xmm6
    354         paddd     xmm5,xmm1
    355         paddd     xmm4,xmm1
    356         psrld     xmm5,SCALEBITS        ; xmm5=CbEL
    357         psrld     xmm4,SCALEBITS        ; xmm4=CbEH
    358         packssdw  xmm5,xmm4             ; xmm5=CbE
    359 
    360         psllw     xmm7,BYTE_BIT
    361         por       xmm5,xmm7             ; xmm5=Cb
    362         movdqa    XMMWORD [rbx], xmm5   ; Save Cb (aligned store)
    363 
    364         movdqa    xmm0, XMMWORD [wk(3)] ; xmm0=BO
    365         movdqa    xmm6, XMMWORD [wk(2)] ; xmm6=BE
    366         movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=RO
    367 
    368         movdqa    xmm4,xmm0
    369         punpcklwd xmm0,xmm3
    370         punpckhwd xmm4,xmm3
    371         movdqa    xmm7,xmm0
    372         movdqa    xmm5,xmm4
    373         pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    374         pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    375         pmaddwd   xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
    376         pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
    377 
    378         movdqa    xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
    379 
    380         paddd     xmm0, XMMWORD [wk(4)]
    381         paddd     xmm4, XMMWORD [wk(5)]
    382         paddd     xmm0,xmm3
    383         paddd     xmm4,xmm3
    384         psrld     xmm0,SCALEBITS        ; xmm0=YOL
    385         psrld     xmm4,SCALEBITS        ; xmm4=YOH
    386         packssdw  xmm0,xmm4             ; xmm0=YO
    387 
    388         pxor      xmm3,xmm3
    389         pxor      xmm4,xmm4
    390         punpcklwd xmm3,xmm1             ; xmm3=ROL
    391         punpckhwd xmm4,xmm1             ; xmm4=ROH
    392         psrld     xmm3,1                ; xmm3=ROL*FIX(0.500)
    393         psrld     xmm4,1                ; xmm4=ROH*FIX(0.500)
    394 
    395         movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
    396 
    397         paddd     xmm7,xmm3
    398         paddd     xmm5,xmm4
    399         paddd     xmm7,xmm1
    400         paddd     xmm5,xmm1
    401         psrld     xmm7,SCALEBITS        ; xmm7=CrOL
    402         psrld     xmm5,SCALEBITS        ; xmm5=CrOH
    403         packssdw  xmm7,xmm5             ; xmm7=CrO
    404 
    405         movdqa    xmm3, XMMWORD [wk(0)] ; xmm3=RE
    406 
    407         movdqa    xmm4,xmm6
    408         punpcklwd xmm6,xmm2
    409         punpckhwd xmm4,xmm2
    410         movdqa    xmm1,xmm6
    411         movdqa    xmm5,xmm4
    412         pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    413         pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    414         pmaddwd   xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
    415         pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
    416 
    417         movdqa    xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
    418 
    419         paddd     xmm6, XMMWORD [wk(6)]
    420         paddd     xmm4, XMMWORD [wk(7)]
    421         paddd     xmm6,xmm2
    422         paddd     xmm4,xmm2
    423         psrld     xmm6,SCALEBITS        ; xmm6=YEL
    424         psrld     xmm4,SCALEBITS        ; xmm4=YEH
    425         packssdw  xmm6,xmm4             ; xmm6=YE
    426 
    427         psllw     xmm0,BYTE_BIT
    428         por       xmm6,xmm0             ; xmm6=Y
    429         movdqa    XMMWORD [rdi], xmm6   ; Save Y (aligned store)
    430 
    431         pxor      xmm2,xmm2
    432         pxor      xmm4,xmm4
    433         punpcklwd xmm2,xmm3             ; xmm2=REL
    434         punpckhwd xmm4,xmm3             ; xmm4=REH
    435         psrld     xmm2,1                ; xmm2=REL*FIX(0.500)
    436         psrld     xmm4,1                ; xmm4=REH*FIX(0.500)
    437 
    438         movdqa    xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
    439 
    440         paddd     xmm1,xmm2
    441         paddd     xmm5,xmm4
    442         paddd     xmm1,xmm0
    443         paddd     xmm5,xmm0
    444         psrld     xmm1,SCALEBITS        ; xmm1=CrEL
    445         psrld     xmm5,SCALEBITS        ; xmm5=CrEH
    446         packssdw  xmm1,xmm5             ; xmm1=CrE
    447 
    448         psllw     xmm7,BYTE_BIT
    449         por       xmm1,xmm7             ; xmm1=Cr
    450         movdqa    XMMWORD [rdx], xmm1   ; Save Cr (aligned store)
    451 
    452         sub     rcx, byte SIZEOF_XMMWORD                ; col -= pixels handled this pass
    453         add     rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
    454         add     rdi, byte SIZEOF_XMMWORD                ; outptr0
    455         add     rbx, byte SIZEOF_XMMWORD                ; outptr1
    456         add     rdx, byte SIZEOF_XMMWORD                ; outptr2
    457         cmp     rcx, byte SIZEOF_XMMWORD
    458         jae     near .columnloop                        ; another full group remains
    459         test    rcx,rcx
    460         jnz     near .column_ld1                        ; partial group remains: take the tail loader
    461 
    462         pop     rcx                     ; col
    463         pop     rsi
    464         pop     rdi
    465         pop     rbx
    466         pop     rdx
    467 
    468         add     rsi, byte SIZEOF_JSAMPROW       ; input_buf
    469         add     rdi, byte SIZEOF_JSAMPROW       ; next entry in each output row array
    470         add     rbx, byte SIZEOF_JSAMPROW
    471         add     rdx, byte SIZEOF_JSAMPROW
    472         dec     rax                             ; num_rows
    473         jg      near .rowloop
    474 
    475 .return:
    476         pop     rbx
    477         uncollect_args
    478         mov     rsp,rbp         ; rsp <- aligned rbp
    479         pop     rsp             ; rsp <- pre-alignment rsp stashed by the prologue
    480         pop     rbp
    481         ret
    482 
    483 ; For some reason, the OS X linker does not honor the request to align the
    484 ; segment unless we do this.
    485         align   16
    486