Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jdclrss2-64.asm - colorspace conversion (64-bit SSE2)
      3 ;
      4 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright 2009, 2012 D. R. Commander
      6 ;
      7 ; Based on
      8 ; x86 SIMD extension for IJG JPEG library
      9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     11 ;
     12 ; This file should be assembled with NASM (Netwide Assembler),
     13 ; can *not* be assembled with Microsoft's MASM or any compatible
     14 ; assembler (including Borland's Turbo Assembler).
     15 ; NASM is available from http://nasm.sourceforge.net/ or
     16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     17 ;
     18 ; [TAB8]
     19 
     20 %include "jcolsamp.inc"
     21 				
     22 ; --------------------------------------------------------------------------
     23 ;
     24 ; Convert some rows of samples to the output colorspace.
     25 ;
     26 ; GLOBAL(void)
     27 ; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
     28 ;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
     29 ;                             JSAMPARRAY output_buf, int num_rows)
     30 ;
     31 
     32 ; r10 = JDIMENSION out_width
     33 ; r11 = JSAMPIMAGE input_buf
     34 ; r12 = JDIMENSION input_row
     35 ; r13 = JSAMPARRAY output_buf
     36 ; r14 = int num_rows
     37 
     38 %define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
     39 %define WK_NUM		2
     40 
     41 	align	16
     42 	global	EXTN(jsimd_ycc_rgb_convert_sse2) PRIVATE
     43 
     44 EXTN(jsimd_ycc_rgb_convert_sse2):
     45 	push	rbp
     46 	mov	rax,rsp				; rax = original rbp
     47 	sub	rsp, byte 4
     48 	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
     49 	mov	[rsp],rax
     50 	mov	rbp,rsp				; rbp = aligned rbp
     51 	lea	rsp, [wk(0)]
     52 	collect_args
     53 	push	rbx
     54 
     55 	mov	rcx, r10	; num_cols
     56 	test	rcx,rcx
     57 	jz	near .return
     58 
     59 	push	rcx
     60 
     61 	mov	rdi, r11
     62 	mov	rcx, r12
     63 	mov	rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
     64 	mov	rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
     65 	mov	rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
     66 	lea	rsi, [rsi+rcx*SIZEOF_JSAMPROW]
     67 	lea	rbx, [rbx+rcx*SIZEOF_JSAMPROW]
     68 	lea	rdx, [rdx+rcx*SIZEOF_JSAMPROW]
     69 
     70 	pop	rcx
     71 
     72 	mov	rdi, r13
     73 	mov	eax, r14d
     74 	test	rax,rax
     75 	jle	near .return
     76 .rowloop:
     77 	push	rax
     78 	push	rdi
     79 	push	rdx
     80 	push	rbx
     81 	push	rsi
     82 	push	rcx			; col
     83 
     84 	mov	rsi, JSAMPROW [rsi]	; inptr0
     85 	mov	rbx, JSAMPROW [rbx]	; inptr1
     86 	mov	rdx, JSAMPROW [rdx]	; inptr2
     87 	mov	rdi, JSAMPROW [rdi]	; outptr
     88 .columnloop:
     89 
     90 	movdqa	xmm5, XMMWORD [rbx]	; xmm5=Cb(0123456789ABCDEF)
     91 	movdqa	xmm1, XMMWORD [rdx]	; xmm1=Cr(0123456789ABCDEF)
     92 
     93 	pcmpeqw	xmm4,xmm4
     94 	pcmpeqw	xmm7,xmm7
     95 	psrlw	xmm4,BYTE_BIT
     96 	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
     97 	movdqa	xmm0,xmm4		; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
     98 
     99 	pand	xmm4,xmm5		; xmm4=Cb(02468ACE)=CbE
    100 	psrlw	xmm5,BYTE_BIT		; xmm5=Cb(13579BDF)=CbO
    101 	pand	xmm0,xmm1		; xmm0=Cr(02468ACE)=CrE
    102 	psrlw	xmm1,BYTE_BIT		; xmm1=Cr(13579BDF)=CrO
    103 
    104 	paddw	xmm4,xmm7
    105 	paddw	xmm5,xmm7
    106 	paddw	xmm0,xmm7
    107 	paddw	xmm1,xmm7
    108 
    109 	; (Original)
    110 	; R = Y                + 1.40200 * Cr
    111 	; G = Y - 0.34414 * Cb - 0.71414 * Cr
    112 	; B = Y + 1.77200 * Cb
    113 	;
    114 	; (This implementation)
    115 	; R = Y                + 0.40200 * Cr + Cr
    116 	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    117 	; B = Y - 0.22800 * Cb + Cb + Cb
    118 
    119 	movdqa	xmm2,xmm4		; xmm2=CbE
    120 	movdqa	xmm3,xmm5		; xmm3=CbO
    121 	paddw	xmm4,xmm4		; xmm4=2*CbE
    122 	paddw	xmm5,xmm5		; xmm5=2*CbO
    123 	movdqa	xmm6,xmm0		; xmm6=CrE
    124 	movdqa	xmm7,xmm1		; xmm7=CrO
    125 	paddw	xmm0,xmm0		; xmm0=2*CrE
    126 	paddw	xmm1,xmm1		; xmm1=2*CrO
    127 
    128 	pmulhw	xmm4,[rel PW_MF0228]	; xmm4=(2*CbE * -FIX(0.22800))
    129 	pmulhw	xmm5,[rel PW_MF0228]	; xmm5=(2*CbO * -FIX(0.22800))
    130 	pmulhw	xmm0,[rel PW_F0402]	; xmm0=(2*CrE * FIX(0.40200))
    131 	pmulhw	xmm1,[rel PW_F0402]	; xmm1=(2*CrO * FIX(0.40200))
    132 
    133 	paddw	xmm4,[rel PW_ONE]
    134 	paddw	xmm5,[rel PW_ONE]
    135 	psraw	xmm4,1			; xmm4=(CbE * -FIX(0.22800))
    136 	psraw	xmm5,1			; xmm5=(CbO * -FIX(0.22800))
    137 	paddw	xmm0,[rel PW_ONE]
    138 	paddw	xmm1,[rel PW_ONE]
    139 	psraw	xmm0,1			; xmm0=(CrE * FIX(0.40200))
    140 	psraw	xmm1,1			; xmm1=(CrO * FIX(0.40200))
    141 
    142 	paddw	xmm4,xmm2
    143 	paddw	xmm5,xmm3
    144 	paddw	xmm4,xmm2		; xmm4=(CbE * FIX(1.77200))=(B-Y)E
    145 	paddw	xmm5,xmm3		; xmm5=(CbO * FIX(1.77200))=(B-Y)O
    146 	paddw	xmm0,xmm6		; xmm0=(CrE * FIX(1.40200))=(R-Y)E
    147 	paddw	xmm1,xmm7		; xmm1=(CrO * FIX(1.40200))=(R-Y)O
    148 
    149 	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=(B-Y)E
    150 	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(B-Y)O
    151 
    152 	movdqa    xmm4,xmm2
    153 	movdqa    xmm5,xmm3
    154 	punpcklwd xmm2,xmm6
    155 	punpckhwd xmm4,xmm6
    156 	pmaddwd   xmm2,[rel PW_MF0344_F0285]
    157 	pmaddwd   xmm4,[rel PW_MF0344_F0285]
    158 	punpcklwd xmm3,xmm7
    159 	punpckhwd xmm5,xmm7
    160 	pmaddwd   xmm3,[rel PW_MF0344_F0285]
    161 	pmaddwd   xmm5,[rel PW_MF0344_F0285]
    162 
    163 	paddd     xmm2,[rel PD_ONEHALF]
    164 	paddd     xmm4,[rel PD_ONEHALF]
    165 	psrad     xmm2,SCALEBITS
    166 	psrad     xmm4,SCALEBITS
    167 	paddd     xmm3,[rel PD_ONEHALF]
    168 	paddd     xmm5,[rel PD_ONEHALF]
    169 	psrad     xmm3,SCALEBITS
    170 	psrad     xmm5,SCALEBITS
    171 
    172 	packssdw  xmm2,xmm4	; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
    173 	packssdw  xmm3,xmm5	; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
    174 	psubw     xmm2,xmm6	; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
    175 	psubw     xmm3,xmm7	; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
    176 
    177 	movdqa    xmm5, XMMWORD [rsi]	; xmm5=Y(0123456789ABCDEF)
    178 
    179 	pcmpeqw   xmm4,xmm4
    180 	psrlw     xmm4,BYTE_BIT		; xmm4={0xFF 0x00 0xFF 0x00 ..}
    181 	pand      xmm4,xmm5		; xmm4=Y(02468ACE)=YE
    182 	psrlw     xmm5,BYTE_BIT		; xmm5=Y(13579BDF)=YO
    183 
    184 	paddw     xmm0,xmm4		; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
    185 	paddw     xmm1,xmm5		; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
    186 	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
    187 	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
    188 
    189 	paddw     xmm2,xmm4		; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
    190 	paddw     xmm3,xmm5		; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
    191 	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
    192 	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
    193 
    194 	paddw     xmm4, XMMWORD [wk(0)]	; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
    195 	paddw     xmm5, XMMWORD [wk(1)]	; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
    196 	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
    197 	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
    198 
    199 %if RGB_PIXELSIZE == 3 ; ---------------
    200 
    201 	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    202 	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    203 	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    204 	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
    205 
    206 	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    207 	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
    208 	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
    209 
    210 	movdqa    xmmG,xmmA
    211 	movdqa    xmmH,xmmA
    212 	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
    213 	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
    214 
    215 	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
    216 	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
    217 
    218 	movdqa    xmmC,xmmD
    219 	movdqa    xmmB,xmmD
    220 	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
    221 	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
    222 
    223 	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
    224 
    225 	movdqa    xmmF,xmmE
    226 	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
    227 	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
    228 
    229 	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
    230 	movdqa    xmmB,xmmE
    231 	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
    232 	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
    233 	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
    234 
    235 	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
    236 	movdqa    xmmB,xmmF
    237 	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
    238 	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
    239 	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
    240 
    241 	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    242 	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    243 	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
    244 
    245 	cmp	rcx, byte SIZEOF_XMMWORD
    246 	jb	short .column_st32
    247 
    248 	test	rdi, SIZEOF_XMMWORD-1
    249 	jnz	short .out1
    250 	; --(aligned)-------------------
    251 	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    252 	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    253 	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
    254 	jmp	short .out0
    255 .out1:	; --(unaligned)-----------------
    256 	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    257 	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    258 	movdqu	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
    259 .out0:
    260 	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
    261 	sub	rcx, byte SIZEOF_XMMWORD
    262 	jz	near .nextrow
    263 
    264 	add	rsi, byte SIZEOF_XMMWORD	; inptr0
    265 	add	rbx, byte SIZEOF_XMMWORD	; inptr1
    266 	add	rdx, byte SIZEOF_XMMWORD	; inptr2
    267 	jmp	near .columnloop
    268 
    269 .column_st32:
    270 	lea	rcx, [rcx+rcx*2]		; imul ecx, RGB_PIXELSIZE
    271 	cmp	rcx, byte 2*SIZEOF_XMMWORD
    272 	jb	short .column_st16
    273 	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    274 	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    275 	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
    276 	movdqa	xmmA,xmmF
    277 	sub	rcx, byte 2*SIZEOF_XMMWORD
    278 	jmp	short .column_st15
    279 .column_st16:
    280 	cmp	rcx, byte SIZEOF_XMMWORD
    281 	jb	short .column_st15
    282 	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    283 	add	rdi, byte SIZEOF_XMMWORD	; outptr
    284 	movdqa	xmmA,xmmD
    285 	sub	rcx, byte SIZEOF_XMMWORD
    286 .column_st15:
    287 	; Store the lower 8 bytes of xmmA to the output when it has enough
    288 	; space.
    289 	cmp	rcx, byte SIZEOF_MMWORD
    290 	jb	short .column_st7
    291 	movq	XMM_MMWORD [rdi], xmmA
    292 	add	rdi, byte SIZEOF_MMWORD
    293 	sub	rcx, byte SIZEOF_MMWORD
    294 	psrldq	xmmA, SIZEOF_MMWORD
    295 .column_st7:
    296 	; Store the lower 4 bytes of xmmA to the output when it has enough
    297 	; space.
    298 	cmp	rcx, byte SIZEOF_DWORD
    299 	jb	short .column_st3
    300 	movd	XMM_DWORD [rdi], xmmA
    301 	add	rdi, byte SIZEOF_DWORD
    302 	sub	rcx, byte SIZEOF_DWORD
    303 	psrldq	xmmA, SIZEOF_DWORD
    304 .column_st3:
    305 	; Store the lower 2 bytes of rax to the output when it has enough
    306 	; space.
    307 	movd	eax, xmmA
    308 	cmp	rcx, byte SIZEOF_WORD
    309 	jb	short .column_st1
    310 	mov	WORD [rdi], ax
    311 	add	rdi, byte SIZEOF_WORD
    312 	sub	rcx, byte SIZEOF_WORD
    313 	shr	rax, 16
    314 .column_st1:
    315 	; Store the lower 1 byte of rax to the output when it has enough
    316 	; space.
    317 	test	rcx, rcx
    318 	jz	short .nextrow
    319 	mov	BYTE [rdi], al
    320 
    321 %else ; RGB_PIXELSIZE == 4 ; -----------
    322 
    323 %ifdef RGBX_FILLER_0XFF
    324 	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
    325 	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
    326 %else
    327 	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
    328 	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
    329 %endif
    330 	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    331 	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    332 	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    333 	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
    334 
    335 	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    336 	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
    337 	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
    338 	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
    339 
    340 	movdqa    xmmC,xmmA
    341 	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
    342 	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
    343 	movdqa    xmmG,xmmB
    344 	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
    345 	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
    346 
    347 	movdqa    xmmD,xmmA
    348 	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    349 	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    350 	movdqa    xmmH,xmmC
    351 	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    352 	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    353 
    354 	cmp	rcx, byte SIZEOF_XMMWORD
    355 	jb	short .column_st32
    356 
    357 	test	rdi, SIZEOF_XMMWORD-1
    358 	jnz	short .out1
    359 	; --(aligned)-------------------
    360 	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    361 	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    362 	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
    363 	movntdq	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
    364 	jmp	short .out0
    365 .out1:	; --(unaligned)-----------------
    366 	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    367 	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    368 	movdqu	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
    369 	movdqu	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
    370 .out0:
    371 	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
    372 	sub	rcx, byte SIZEOF_XMMWORD
    373 	jz	near .nextrow
    374 
    375 	add	rsi, byte SIZEOF_XMMWORD	; inptr0
    376 	add	rbx, byte SIZEOF_XMMWORD	; inptr1
    377 	add	rdx, byte SIZEOF_XMMWORD	; inptr2
    378 	jmp	near .columnloop
    379 
    380 .column_st32:
    381 	cmp	rcx, byte SIZEOF_XMMWORD/2
    382 	jb	short .column_st16
    383 	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    384 	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    385 	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
    386 	movdqa	xmmA,xmmC
    387 	movdqa	xmmD,xmmH
    388 	sub	rcx, byte SIZEOF_XMMWORD/2
    389 .column_st16:
    390 	cmp	rcx, byte SIZEOF_XMMWORD/4
    391 	jb	short .column_st15
    392 	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    393 	add	rdi, byte SIZEOF_XMMWORD	; outptr
    394 	movdqa	xmmA,xmmD
    395 	sub	rcx, byte SIZEOF_XMMWORD/4
    396 .column_st15:
    397 	; Store two pixels (8 bytes) of xmmA to the output when it has enough
    398 	; space.
    399 	cmp	rcx, byte SIZEOF_XMMWORD/8
    400 	jb	short .column_st7
    401 	movq	MMWORD [rdi], xmmA
    402 	add	rdi, byte SIZEOF_XMMWORD/8*4
    403 	sub	rcx, byte SIZEOF_XMMWORD/8
    404 	psrldq	xmmA, SIZEOF_XMMWORD/8*4
    405 .column_st7:
    406 	; Store one pixel (4 bytes) of xmmA to the output when it has enough
    407 	; space.
    408 	test	rcx, rcx
    409 	jz	short .nextrow
    410 	movd	XMM_DWORD [rdi], xmmA
    411 
    412 %endif ; RGB_PIXELSIZE ; ---------------
    413 
    414 .nextrow:
    415 	pop	rcx
    416 	pop	rsi
    417 	pop	rbx
    418 	pop	rdx
    419 	pop	rdi
    420 	pop	rax
    421 
    422 	add	rsi, byte SIZEOF_JSAMPROW
    423 	add	rbx, byte SIZEOF_JSAMPROW
    424 	add	rdx, byte SIZEOF_JSAMPROW
    425 	add	rdi, byte SIZEOF_JSAMPROW	; output_buf
    426 	dec	rax				; num_rows
    427 	jg	near .rowloop
    428 
    429 	sfence		; flush the write buffer
    430 
    431 .return:
    432 	pop	rbx
    433 	uncollect_args
    434 	mov	rsp,rbp		; rsp <- aligned rbp
    435 	pop	rsp		; rsp <- original rbp
    436 	pop	rbp
    437 	ret
    438 
    439 ; For some reason, the OS X linker does not honor the request to align the
    440 ; segment unless we do this.
    441 	align	16
    442