Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jcclrss2-64.asm - colorspace conversion (64-bit SSE2)
      3 ;
      4 ; x86 SIMD extension for IJG JPEG library
      5 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      6 ; Copyright (C) 2009, D. R. Commander.
      7 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
      8 ;
      9 ; This file should be assembled with NASM (Netwide Assembler),
     10 ; can *not* be assembled with Microsoft's MASM or any compatible
     11 ; assembler (including Borland's Turbo Assembler).
     12 ; NASM is available from http://nasm.sourceforge.net/ or
     13 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     14 ;
     15 ; [TAB8]
     16 
     17 %include "jcolsamp.inc"
     18 
     19 ; --------------------------------------------------------------------------
     20 ;
     21 ; Convert some rows of samples to the output colorspace.
     22 ;
     23 ; GLOBAL(void)
     24 ; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
     25 ;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
     26 ;                             JDIMENSION output_row, int num_rows);
     27 ;
     28 
     29 ; r10 = JDIMENSION img_width
     30 ; r11 = JSAMPARRAY input_buf
     31 ; r12 = JSAMPIMAGE output_buf
     32 ; r13 = JDIMENSION output_row
     33 ; r14 = int num_rows
     34 
     35 %define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]: address of scratch slot i, laid out directly below rbp
     36 %define WK_NUM		8	; number of xmmword scratch slots; the routine below uses wk(0)..wk(7)
     37 
     38 	align	16	; start the routine on a 16-byte boundary
     39 
     40 	global	EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE	; export the entry point (EXTN/PRIVATE are not defined in this file; presumably supplied via jcolsamp.inc)
     41 
     42 EXTN(jsimd_rgb_ycc_convert_sse2):
        ; Converts num_rows rows of packed RGB pixels (RGB_PIXELSIZE bytes each)
        ; into separate Y, Cb and Cr sample rows, SIZEOF_XMMWORD pixels per
        ; inner-loop iteration.  After collect_args the arguments are expected
        ; in r10..r14 as listed in the register table above this routine.
        ;
        ; Notes derived from the code below:
        ;  * img_width and output_row are JDIMENSION arguments.  They are read
        ;    through r10d/r13d because the x86-64 calling conventions leave the
        ;    upper half of a register that carries a 32-bit argument undefined
        ;    (assumes JDIMENSION is 32 bits wide, as in libjpeg's jmorecfg.h --
        ;    the typedef is not visible in this file).
        ;  * num_rows is a signed int; zero or negative values return at once.
        ;  * Y/Cb/Cr are stored with movdqa, always one whole xmmword at a time,
        ;    so each output row must be xmmword-aligned and have room for a
        ;    multiple of SIZEOF_XMMWORD samples.
        ;  * Input rows are read with unaligned loads.  A final group of fewer
        ;    than SIZEOF_XMMWORD pixels is gathered piecewise starting at
        ;    .column_ld1; lanes beyond img_width then hold stale register
        ;    contents, so the matching output samples are meaningless
        ;    (presumably ignored by the caller).
        ;  * rbx is saved/restored here; other callee-saved registers are
        ;    presumably handled by collect_args/uncollect_args (macros defined
        ;    outside this file).
     43 	push	rbp
     44 	mov	rax,rsp				; rax = rsp after the push (address of the saved rbp)
     45 	sub	rsp, byte 4
     46 	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
     47 	mov	[rsp],rax			; keep the unaligned rsp for the epilogue's "pop rsp"
     48 	mov	rbp,rsp				; rbp = aligned rbp
     49 	lea	rsp, [wk(0)]			; reserve WK_NUM xmmwords of scratch below rbp
     50 	collect_args
     51 	push	rbx
     52 
     53 	mov	ecx, r10d			; rcx = img_width (low dword only; zero-extended)
     54 	test	rcx,rcx
     55 	jz	near .return
     56 
     57 	push	rcx
     58 
     59 	mov rsi, r12
     60 	mov	ecx, r13d			; rcx = output_row (low dword only; zero-extended)
     61 	mov	rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
     62 	mov	rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
     63 	mov	rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
     64 	lea	rdi, [rdi+rcx*SIZEOF_JSAMPROW]	; rdi = &output_buf[0][output_row]
     65 	lea	rbx, [rbx+rcx*SIZEOF_JSAMPROW]	; rbx = &output_buf[1][output_row]
     66 	lea	rdx, [rdx+rcx*SIZEOF_JSAMPROW]	; rdx = &output_buf[2][output_row]
     67 
     68 	pop	rcx				; rcx = img_width again
     69 
     70 	mov rsi, r11
     71 	mov	eax, r14d
     72 	test	eax,eax				; signed 32-bit test so a negative num_rows also exits
     73 	jle	near .return
     74 .rowloop:
     75 	push	rdx
     76 	push	rbx
     77 	push	rdi
     78 	push	rsi
     79 	push	rcx			; col
     80 
     81 	mov	rsi, JSAMPROW [rsi]	; inptr
     82 	mov	rdi, JSAMPROW [rdi]	; outptr0
     83 	mov	rbx, JSAMPROW [rbx]	; outptr1
     84 	mov	rdx, JSAMPROW [rdx]	; outptr2
     85 
     86 	cmp	rcx, byte SIZEOF_XMMWORD
     87 	jae	near .columnloop
     88 
     89 %if RGB_PIXELSIZE == 3 ; ---------------
     90 
        ; Partial group: rcx (< SIZEOF_XMMWORD pixels) is turned into a byte
        ; count and its set bits select 1/2/4/8/16/32-byte loads from the end
        ; of the remaining input towards its start.
     91 .column_ld1:
     92 	push	rax
     93 	push	rdx
     94 	lea	rcx,[rcx+rcx*2]		; imul ecx,RGB_PIXELSIZE
     95 	test	cl, SIZEOF_BYTE
     96 	jz	short .column_ld2
     97 	sub	rcx, byte SIZEOF_BYTE
     98 	movzx	rax, BYTE [rsi+rcx]
     99 .column_ld2:
    100 	test	cl, SIZEOF_WORD
    101 	jz	short .column_ld4
    102 	sub	rcx, byte SIZEOF_WORD
    103 	movzx	rdx, WORD [rsi+rcx]
    104 	shl	rax, WORD_BIT
    105 	or	rax,rdx
    106 .column_ld4:
    107 	movd	xmmA,eax
    108 	pop	rdx
    109 	pop	rax
    110 	test	cl, SIZEOF_DWORD
    111 	jz	short .column_ld8
    112 	sub	rcx, byte SIZEOF_DWORD
    113 	movd	xmmF, XMM_DWORD [rsi+rcx]
    114 	pslldq	xmmA, SIZEOF_DWORD
    115 	por	xmmA,xmmF
    116 .column_ld8:
    117 	test	cl, SIZEOF_MMWORD
    118 	jz	short .column_ld16
    119 	sub	rcx, byte SIZEOF_MMWORD
    120 	movq	xmmB, XMM_MMWORD [rsi+rcx]
    121 	pslldq	xmmA, SIZEOF_MMWORD
    122 	por	xmmA,xmmB
    123 .column_ld16:
    124 	test	cl, SIZEOF_XMMWORD
    125 	jz	short .column_ld32
    126 	movdqa	xmmF,xmmA
    127 	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    128 	mov	rcx, SIZEOF_XMMWORD	; count one full group so the loop ends after this pass
    129 	jmp	short .rgb_ycc_cnv
    130 .column_ld32:
    131 	test	cl, 2*SIZEOF_XMMWORD
    132 	mov	rcx, SIZEOF_XMMWORD	; mov leaves the flags from the test above intact
    133 	jz	short .rgb_ycc_cnv
    134 	movdqa	xmmB,xmmA
    135 	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    136 	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    137 	jmp	short .rgb_ycc_cnv
    138 
    139 .columnloop:
    140 	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    141 	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    142 	movdqu	xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    143 
    144 .rgb_ycc_cnv:
    145 	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    146 	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    147 	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
    148 
    149 	movdqa    xmmG,xmmA
    150 	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    151 	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
    152 
    153 	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    154 	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
    155 
    156 	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    157 	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
    158 
    159 	movdqa    xmmD,xmmA
    160 	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    161 	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
    162 
    163 	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    164 	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
    165 
    166 	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    167 	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
    168 
    169 	movdqa    xmmE,xmmA
    170 	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    171 	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
    172 
    173 	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    174 	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
    175 
    176 	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    177 	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
    178 
    179 	pxor      xmmH,xmmH
    180 
    181 	movdqa    xmmC,xmmA
    182 	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
    183 	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)
    184 
    185 	movdqa    xmmB,xmmE
    186 	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
    187 	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)
    188 
    189 	movdqa    xmmF,xmmD
    190 	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
    191 	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)
    192 
    193 %else ; RGB_PIXELSIZE == 4 ; -----------
    194 
        ; Partial group: the set bits of the pixel count in rcx select
        ; 1/2/4/8-pixel loads from the end of the remaining input towards
        ; its start.
    195 .column_ld1:
    196 	test	cl, SIZEOF_XMMWORD/16
    197 	jz	short .column_ld2
    198 	sub	rcx, byte SIZEOF_XMMWORD/16
    199 	movd	xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
    200 .column_ld2:
    201 	test	cl, SIZEOF_XMMWORD/8
    202 	jz	short .column_ld4
    203 	sub	rcx, byte SIZEOF_XMMWORD/8
    204 	movq	xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    205 	pslldq	xmmA, SIZEOF_MMWORD
    206 	por	xmmA,xmmE
    207 .column_ld4:
    208 	test	cl, SIZEOF_XMMWORD/4
    209 	jz	short .column_ld8
    210 	sub	rcx, byte SIZEOF_XMMWORD/4
    211 	movdqa	xmmE,xmmA
    212 	movdqu	xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
    213 .column_ld8:
    214 	test	cl, SIZEOF_XMMWORD/2
    215 	mov	rcx, SIZEOF_XMMWORD	; mov leaves the flags from the test above intact
    216 	jz	short .rgb_ycc_cnv
    217 	movdqa	xmmF,xmmA
    218 	movdqa	xmmH,xmmE
    219 	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    220 	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    221 	jmp	short .rgb_ycc_cnv
    222 
    223 .columnloop:
    224 	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    225 	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    226 	movdqu	xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    227 	movdqu	xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
    228 
    229 .rgb_ycc_cnv:
    230 	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    231 	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    232 	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    233 	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    234 
    235 	movdqa    xmmD,xmmA
    236 	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    237 	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
    238 
    239 	movdqa    xmmC,xmmF
    240 	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    241 	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
    242 
    243 	movdqa    xmmB,xmmA
    244 	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    245 	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
    246 
    247 	movdqa    xmmG,xmmD
    248 	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    249 	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
    250 
    251 	movdqa    xmmE,xmmA
    252 	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    253 	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
    254 
    255 	movdqa    xmmH,xmmB
    256 	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    257 	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
    258 
    259 	pxor      xmmF,xmmF
    260 
    261 	movdqa    xmmC,xmmA
    262 	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
    263 	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)
    264 
    265 	movdqa    xmmD,xmmB
    266 	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
    267 	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)
    268 
    269 	movdqa    xmmG,xmmE
    270 	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
    271 	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)
    272 
    273 	punpcklbw xmmF,xmmH
    274 	punpckhbw xmmH,xmmH
    275 	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
    276 	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)
    277 
    278 %endif ; RGB_PIXELSIZE ; ---------------
    279 
    280 	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    281 	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
    282 
    283 	; (Original)
    284 	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    285 	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    286 	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    287 	;
    288 	; (This implementation)
    289 	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    290 	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    291 	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    292 
    293 	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=RE
    294 	movdqa    XMMWORD [wk(1)], xmm1	; wk(1)=RO
    295 	movdqa    XMMWORD [wk(2)], xmm4	; wk(2)=BE
    296 	movdqa    XMMWORD [wk(3)], xmm5	; wk(3)=BO
    297 
    298 	movdqa    xmm6,xmm1
    299 	punpcklwd xmm1,xmm3
    300 	punpckhwd xmm6,xmm3
    301 	movdqa    xmm7,xmm1
    302 	movdqa    xmm4,xmm6
    303 	pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    304 	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    305 	pmaddwd   xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
    306 	pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
    307 
    308 	movdqa    XMMWORD [wk(4)], xmm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
    309 	movdqa    XMMWORD [wk(5)], xmm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
    310 
    311 	pxor      xmm1,xmm1
    312 	pxor      xmm6,xmm6
    313 	punpcklwd xmm1,xmm5		; xmm1=BOL
    314 	punpckhwd xmm6,xmm5		; xmm6=BOH
    315 	psrld     xmm1,1		; xmm1=BOL*FIX(0.500)
    316 	psrld     xmm6,1		; xmm6=BOH*FIX(0.500)
    317 
    318 	movdqa    xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
    319 
    320 	paddd     xmm7,xmm1
    321 	paddd     xmm4,xmm6
    322 	paddd     xmm7,xmm5
    323 	paddd     xmm4,xmm5
    324 	psrld     xmm7,SCALEBITS	; xmm7=CbOL
    325 	psrld     xmm4,SCALEBITS	; xmm4=CbOH
    326 	packssdw  xmm7,xmm4		; xmm7=CbO
    327 
    328 	movdqa    xmm1, XMMWORD [wk(2)]	; xmm1=BE
    329 
    330 	movdqa    xmm6,xmm0
    331 	punpcklwd xmm0,xmm2
    332 	punpckhwd xmm6,xmm2
    333 	movdqa    xmm5,xmm0
    334 	movdqa    xmm4,xmm6
    335 	pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    336 	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
    337 	pmaddwd   xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
    338 	pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
    339 
    340 	movdqa    XMMWORD [wk(6)], xmm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
    341 	movdqa    XMMWORD [wk(7)], xmm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
    342 
    343 	pxor      xmm0,xmm0
    344 	pxor      xmm6,xmm6
    345 	punpcklwd xmm0,xmm1		; xmm0=BEL
    346 	punpckhwd xmm6,xmm1		; xmm6=BEH
    347 	psrld     xmm0,1		; xmm0=BEL*FIX(0.500)
    348 	psrld     xmm6,1		; xmm6=BEH*FIX(0.500)
    349 
    350 	movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
    351 
    352 	paddd     xmm5,xmm0
    353 	paddd     xmm4,xmm6
    354 	paddd     xmm5,xmm1
    355 	paddd     xmm4,xmm1
    356 	psrld     xmm5,SCALEBITS	; xmm5=CbEL
    357 	psrld     xmm4,SCALEBITS	; xmm4=CbEH
    358 	packssdw  xmm5,xmm4		; xmm5=CbE
    359 
    360 	psllw     xmm7,BYTE_BIT		; move odd-column bytes into the high byte of each word
    361 	por       xmm5,xmm7		; xmm5=Cb
    362 	movdqa    XMMWORD [rbx], xmm5	; Save Cb
    363 
    364 	movdqa    xmm0, XMMWORD [wk(3)]	; xmm0=BO
    365 	movdqa    xmm6, XMMWORD [wk(2)]	; xmm6=BE
    366 	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=RO
    367 
    368 	movdqa    xmm4,xmm0
    369 	punpcklwd xmm0,xmm3
    370 	punpckhwd xmm4,xmm3
    371 	movdqa    xmm7,xmm0
    372 	movdqa    xmm5,xmm4
    373 	pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    374 	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    375 	pmaddwd   xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
    376 	pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
    377 
    378 	movdqa    xmm3,[rel PD_ONEHALF]	; xmm3=[PD_ONEHALF]
    379 
    380 	paddd     xmm0, XMMWORD [wk(4)]
    381 	paddd     xmm4, XMMWORD [wk(5)]
    382 	paddd     xmm0,xmm3
    383 	paddd     xmm4,xmm3
    384 	psrld     xmm0,SCALEBITS	; xmm0=YOL
    385 	psrld     xmm4,SCALEBITS	; xmm4=YOH
    386 	packssdw  xmm0,xmm4		; xmm0=YO
    387 
    388 	pxor      xmm3,xmm3
    389 	pxor      xmm4,xmm4
    390 	punpcklwd xmm3,xmm1		; xmm3=ROL
    391 	punpckhwd xmm4,xmm1		; xmm4=ROH
    392 	psrld     xmm3,1		; xmm3=ROL*FIX(0.500)
    393 	psrld     xmm4,1		; xmm4=ROH*FIX(0.500)
    394 
    395 	movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
    396 
    397 	paddd     xmm7,xmm3
    398 	paddd     xmm5,xmm4
    399 	paddd     xmm7,xmm1
    400 	paddd     xmm5,xmm1
    401 	psrld     xmm7,SCALEBITS	; xmm7=CrOL
    402 	psrld     xmm5,SCALEBITS	; xmm5=CrOH
    403 	packssdw  xmm7,xmm5		; xmm7=CrO
    404 
    405 	movdqa    xmm3, XMMWORD [wk(0)]	; xmm3=RE
    406 
    407 	movdqa    xmm4,xmm6
    408 	punpcklwd xmm6,xmm2
    409 	punpckhwd xmm4,xmm2
    410 	movdqa    xmm1,xmm6
    411 	movdqa    xmm5,xmm4
    412 	pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    413 	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    414 	pmaddwd   xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
    415 	pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
    416 
    417 	movdqa    xmm2,[rel PD_ONEHALF]	; xmm2=[PD_ONEHALF]
    418 
    419 	paddd     xmm6, XMMWORD [wk(6)]
    420 	paddd     xmm4, XMMWORD [wk(7)]
    421 	paddd     xmm6,xmm2
    422 	paddd     xmm4,xmm2
    423 	psrld     xmm6,SCALEBITS	; xmm6=YEL
    424 	psrld     xmm4,SCALEBITS	; xmm4=YEH
    425 	packssdw  xmm6,xmm4		; xmm6=YE
    426 
    427 	psllw     xmm0,BYTE_BIT		; move odd-column bytes into the high byte of each word
    428 	por       xmm6,xmm0		; xmm6=Y
    429 	movdqa    XMMWORD [rdi], xmm6	; Save Y
    430 
    431 	pxor      xmm2,xmm2
    432 	pxor      xmm4,xmm4
    433 	punpcklwd xmm2,xmm3		; xmm2=REL
    434 	punpckhwd xmm4,xmm3		; xmm4=REH
    435 	psrld     xmm2,1		; xmm2=REL*FIX(0.500)
    436 	psrld     xmm4,1		; xmm4=REH*FIX(0.500)
    437 
    438 	movdqa    xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
    439 
    440 	paddd     xmm1,xmm2
    441 	paddd     xmm5,xmm4
    442 	paddd     xmm1,xmm0
    443 	paddd     xmm5,xmm0
    444 	psrld     xmm1,SCALEBITS	; xmm1=CrEL
    445 	psrld     xmm5,SCALEBITS	; xmm5=CrEH
    446 	packssdw  xmm1,xmm5		; xmm1=CrE
    447 
    448 	psllw     xmm7,BYTE_BIT		; move odd-column bytes into the high byte of each word
    449 	por       xmm1,xmm7		; xmm1=Cr
    450 	movdqa    XMMWORD [rdx], xmm1	; Save Cr
    451 
    452 	sub	rcx, byte SIZEOF_XMMWORD
    453 	add	rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
    454 	add	rdi, byte SIZEOF_XMMWORD		; outptr0
    455 	add	rbx, byte SIZEOF_XMMWORD		; outptr1
    456 	add	rdx, byte SIZEOF_XMMWORD		; outptr2
    457 	cmp	rcx, byte SIZEOF_XMMWORD
    458 	jae	near .columnloop
    459 	test	rcx,rcx
    460 	jnz	near .column_ld1		; 1..SIZEOF_XMMWORD-1 pixels left: take the partial-load path
    461 
    462 	pop	rcx			; col
    463 	pop	rsi
    464 	pop	rdi
    465 	pop	rbx
    466 	pop	rdx
    467 
    468 	add	rsi, byte SIZEOF_JSAMPROW	; input_buf
    469 	add	rdi, byte SIZEOF_JSAMPROW
    470 	add	rbx, byte SIZEOF_JSAMPROW
    471 	add	rdx, byte SIZEOF_JSAMPROW
    472 	dec	rax				; num_rows
    473 	jg	near .rowloop
    474 
    475 .return:
    476 	pop	rbx
    477 	uncollect_args
    478 	mov	rsp,rbp		; rsp <- aligned rbp
    479 	pop	rsp		; rsp <- stack pointer saved in the prologue (address of the saved rbp)
    480 	pop	rbp
    481 	ret
    482 
    483 ; For some reason, the OS X linker does not honor the request to align the
    484 ; segment unless we do this.
    485 	align	16
    486