Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jcclrmmx.asm - colorspace conversion (MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ;
      6 ; Based on
      7 ; x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler); it
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jcolsamp.inc"
     20 
     21 ; --------------------------------------------------------------------------
     22 ;
     23 ; Convert some rows of samples to the output colorspace.
     24 ;
     25 ; GLOBAL(void)
     26 ; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width,
     27 ;                           JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
     28 ;                           JDIMENSION output_row, int num_rows);
     29 ;
     30 
     31 %define img_width(b)	(b)+8			; JDIMENSION img_width  (b = frame base: [b]=saved ebp, [b+4]=return address)
     32 %define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
     33 %define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
     34 %define output_row(b)	(b)+20		; JDIMENSION output_row
     35 %define num_rows(b)	(b)+24		; int num_rows
     36 
     37 %define original_ebp	ebp+0	; slot at the aligned ebp where the prologue stores the unaligned frame base
     38 %define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM], located just below the aligned ebp
     39 %define WK_NUM		8	; number of mmword scratch slots: wk(0)..wk(7)
     40 %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr (pointer-sized slot directly below wk(0))
     41 
     42 	align	16	; start the routine on a 16-byte boundary
     43 	global	EXTN(jsimd_rgb_ycc_convert_mmx) PRIVATE
     44 
     45 EXTN(jsimd_rgb_ycc_convert_mmx):	; entry point; all five arguments are read from the stack through eax
     46 	push	ebp	; save the caller's frame pointer
     47 	mov	eax,esp				; eax = "original ebp": unaligned frame base ([eax]=saved ebp, args from eax+8)
     48 	sub	esp, byte 4	; make sure a dword slot exists below the frame base
     49 	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
     50 	mov	[esp],eax	; [aligned ebp+0] = unaligned frame base (reloaded by "pop esp" in the epilogue)
     51 	mov	ebp,esp				; ebp = aligned ebp
     52 	lea	esp, [wk(0)]	; reserve WK_NUM mmwords of scratch below the aligned ebp
     53 	pushpic	eax		; make room for the GOT address (the gotptr slot, directly below wk(0))
     54 	push	ebx	; ebx/esi/edi are preserved for the caller
     55 ;	push	ecx		; need not be preserved
     56 ;	push	edx		; need not be preserved
     57 	push	esi
     58 	push	edi
     59 
     60 	get_GOT	ebx			; get GOT address
     61 	movpic	POINTER [gotptr], ebx	; save GOT address
     62 
     63 	mov	ecx, JDIMENSION [img_width(eax)]	; ecx = num_cols
     64 	test	ecx,ecx
     65 	jz	near .return	; zero-width image: nothing to convert
     66 
     67 	push	ecx	; park num_cols; ecx is borrowed for output_row below
     68 
     69 	mov	esi, JSAMPIMAGE [output_buf(eax)]	; esi = output_buf
     70 	mov	ecx, JDIMENSION [output_row(eax)]	; ecx = output_row
     71 	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]	; edi = output_buf[0]
     72 	mov	ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]	; ebx = output_buf[1]
     73 	mov	edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]	; edx = output_buf[2]
     74 	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]	; edi = &output_buf[0][output_row]
     75 	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]	; ebx = &output_buf[1][output_row]
     76 	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]	; edx = &output_buf[2][output_row]
     77 
     78 	pop	ecx	; ecx = num_cols again
     79 
     80 	mov	esi, JSAMPARRAY [input_buf(eax)]	; esi = input_buf
     81 	mov	eax, INT [num_rows(eax)]	; eax = num_rows (the frame base is not needed after this)
     82 	test	eax,eax
     83 	jle	near .return	; nothing to do unless num_rows > 0
     84 	alignx	16,7
     85 .rowloop:	; ---- one iteration per row; eax = rows remaining ----
     86 	pushpic	eax	; save the row counter; eax is reloaded with the GOT address below (pushpic/movpic/poppic are not defined in this file -- presumably no-ops in non-PIC builds, confirm)
     87 	push	edx	; save the per-component row-pointer cursors
     88 	push	ebx
     89 	push	edi
     90 	push	esi
     91 	push	ecx			; col
     92 
     93 	mov	esi, JSAMPROW [esi]	; inptr
     94 	mov	edi, JSAMPROW [edi]	; outptr0
     95 	mov	ebx, JSAMPROW [ebx]	; outptr1
     96 	mov	edx, JSAMPROW [edx]	; outptr2
     97 	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
     98 
     99 	cmp	ecx, byte SIZEOF_MMWORD	; at least SIZEOF_MMWORD columns in this row?
    100 	jae	short .columnloop	; yes: take the full-width path; otherwise fall into the tail loader
    101 	alignx	16,7
    102 
    103 %if RGB_PIXELSIZE == 3 ; ---------------
    104 
    105 .column_ld1:	; ---- tail: fewer than SIZEOF_MMWORD columns left; gather the remaining bytes piecewise ----
    106 	push	eax	; eax and edx are used as scratch until .column_ld4
    107 	push	edx
    108 	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE  (ecx = remaining input bytes)
    109 	test	cl, SIZEOF_BYTE	; SIZEOF_BYTE bit of the byte count set?
    110 	jz	short .column_ld2
    111 	sub	ecx, byte SIZEOF_BYTE
    112 	xor	eax,eax
    113 	mov	al, BYTE [esi+ecx]	; pick up the last byte
    114 .column_ld2:
    115 	test	cl, SIZEOF_WORD	; SIZEOF_WORD bit of the byte count set?
    116 	jz	short .column_ld4
    117 	sub	ecx, byte SIZEOF_WORD
    118 	xor	edx,edx
    119 	mov	dx, WORD [esi+ecx]	; pick up the word that precedes what was gathered so far
    120 	shl	eax, WORD_BIT
    121 	or	eax,edx	; eax = (eax << WORD_BIT) | word
    122 .column_ld4:
    123 	movd	mmA,eax	; mmA = bytes gathered so far
    124 	pop	edx	; restore outptr2
    125 	pop	eax	; restore the value loaded by movpic above
    126 	test	cl, SIZEOF_DWORD	; SIZEOF_DWORD bit of the byte count set?
    127 	jz	short .column_ld8
    128 	sub	ecx, byte SIZEOF_DWORD
    129 	movd	mmG, DWORD [esi+ecx]
    130 	psllq	mmA, DWORD_BIT
    131 	por	mmA,mmG	; mmA = (gathered bytes << DWORD_BIT) | preceding dword
    132 .column_ld8:
    133 	test	cl, SIZEOF_MMWORD	; one whole mmword in front of the gathered bytes?
    134 	jz	short .column_ld16
    135 	movq	mmG,mmA	; gathered bytes become the second mmword
    136 	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    137 	mov	ecx, SIZEOF_MMWORD	; so that the counter reaches 0 after this pass
    138 	jmp	short .rgb_ycc_cnv
    139 .column_ld16:
    140 	test	cl, 2*SIZEOF_MMWORD	; two whole mmwords in front of the gathered bytes?
    141 	mov	ecx, SIZEOF_MMWORD	; mov leaves the flags from test intact; counter reaches 0 after this pass
    142 	jz	short .rgb_ycc_cnv
    143 	movq	mmF,mmA	; gathered bytes become the third mmword
    144 	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    145 	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
    146 	jmp	short .rgb_ycc_cnv
    147 	alignx	16,7
    148 
    149 .columnloop:	; ---- full-width pass: load three mmwords of packed 3-byte pixels ----
    150 	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    151 	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
    152 	movq	mmF, MMWORD [esi+2*SIZEOF_MMWORD]
    153 
    154 .rgb_ycc_cnv:	; mmA..mmH are register aliases not defined in this file (presumably in jcolsamp.inc -- confirm their mapping to mm0-mm7 there)
    155 	; mmA=(00 10 20 01 11 21 02 12)
    156 	; mmG=(22 03 13 23 04 14 24 05)
    157 	; mmF=(15 25 06 16 26 07 17 27)
    158 
    159 	movq      mmD,mmA
    160 	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 10 20 01)
    161 	psrlq     mmD,4*BYTE_BIT	; mmD=(11 21 02 12 -- -- -- --)
    162 
    163 	punpckhbw mmA,mmG		; mmA=(00 04 10 14 20 24 01 05)
    164 	psllq     mmG,4*BYTE_BIT	; mmG=(-- -- -- -- 22 03 13 23)
    165 
    166 	punpcklbw mmD,mmF		; mmD=(11 15 21 25 02 06 12 16)
    167 	punpckhbw mmG,mmF		; mmG=(22 26 03 07 13 17 23 27)
    168 
    169 	movq      mmE,mmA
    170 	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 04 10 14)
    171 	psrlq     mmE,4*BYTE_BIT	; mmE=(20 24 01 05 -- -- -- --)
    172 
    173 	punpckhbw mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
    174 	psllq     mmD,4*BYTE_BIT	; mmD=(-- -- -- -- 11 15 21 25)
    175 
    176 	punpcklbw mmE,mmG		; mmE=(20 22 24 26 01 03 05 07)
    177 	punpckhbw mmD,mmG		; mmD=(11 13 15 17 21 23 25 27)
    178 
    179 	pxor      mmH,mmH	; mmH = 0, used to zero-extend bytes to words below
    180 
    181 	movq      mmC,mmA
    182 	punpcklbw mmA,mmH		; mmA=(00 02 04 06)
    183 	punpckhbw mmC,mmH		; mmC=(10 12 14 16)
    184 
    185 	movq      mmB,mmE
    186 	punpcklbw mmE,mmH		; mmE=(20 22 24 26)
    187 	punpckhbw mmB,mmH		; mmB=(01 03 05 07)
    188 
    189 	movq      mmF,mmD
    190 	punpcklbw mmD,mmH		; mmD=(11 13 15 17)
    191 	punpckhbw mmF,mmH		; mmF=(21 23 25 27)
    192 
    193 %else ; RGB_PIXELSIZE == 4 ; -----------
    194 
    195 .column_ld1:	; ---- tail: fewer than SIZEOF_MMWORD columns left; ecx counts pixels here ----
    196 	test	cl, SIZEOF_MMWORD/8	; lowest group-size bit of the column count set?
    197 	jz	short .column_ld2
    198 	sub	ecx, byte SIZEOF_MMWORD/8
    199 	movd	mmA, DWORD [esi+ecx*RGB_PIXELSIZE]	; pick up the last pixel
    200 .column_ld2:
    201 	test	cl, SIZEOF_MMWORD/4	; next group-size bit of the column count set?
    202 	jz	short .column_ld4
    203 	sub	ecx, byte SIZEOF_MMWORD/4
    204 	movq	mmF,mmA	; what was gathered so far moves up one mmword
    205 	movq	mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
    206 .column_ld4:
    207 	test	cl, SIZEOF_MMWORD/2	; two whole mmwords in front of the gathered pixels?
    208 	mov	ecx, SIZEOF_MMWORD	; mov leaves the flags from test intact; counter reaches 0 after this pass
    209 	jz	short .rgb_ycc_cnv
    210 	movq	mmD,mmA	; gathered pixels become the third and fourth mmwords
    211 	movq	mmC,mmF
    212 	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    213 	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
    214 	jmp	short .rgb_ycc_cnv
    215 	alignx	16,7
    216 
    217 .columnloop:	; ---- full-width pass: load four mmwords of packed 4-byte pixels ----
    218 	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    219 	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
    220 	movq	mmD, MMWORD [esi+2*SIZEOF_MMWORD]
    221 	movq	mmC, MMWORD [esi+3*SIZEOF_MMWORD]
    222 
    223 .rgb_ycc_cnv:	; mmA..mmH are register aliases not defined in this file (presumably in jcolsamp.inc -- confirm their mapping to mm0-mm7 there)
    224 	; mmA=(00 10 20 30 01 11 21 31)
    225 	; mmF=(02 12 22 32 03 13 23 33)
    226 	; mmD=(04 14 24 34 05 15 25 35)
    227 	; mmC=(06 16 26 36 07 17 27 37)
    228 
    229 	movq      mmB,mmA
    230 	punpcklbw mmA,mmF		; mmA=(00 02 10 12 20 22 30 32)
    231 	punpckhbw mmB,mmF		; mmB=(01 03 11 13 21 23 31 33)
    232 
    233 	movq      mmG,mmD
    234 	punpcklbw mmD,mmC		; mmD=(04 06 14 16 24 26 34 36)
    235 	punpckhbw mmG,mmC		; mmG=(05 07 15 17 25 27 35 37)
    236 
    237 	movq      mmE,mmA
    238 	punpcklwd mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
    239 	punpckhwd mmE,mmD		; mmE=(20 22 24 26 30 32 34 36)
    240 
    241 	movq      mmH,mmB
    242 	punpcklwd mmB,mmG		; mmB=(01 03 05 07 11 13 15 17)
    243 	punpckhwd mmH,mmG		; mmH=(21 23 25 27 31 33 35 37)
    244 
    245 	pxor      mmF,mmF	; mmF = 0, used to zero-extend bytes to words below
    246 
    247 	movq      mmC,mmA
    248 	punpcklbw mmA,mmF		; mmA=(00 02 04 06)
    249 	punpckhbw mmC,mmF		; mmC=(10 12 14 16)
    250 
    251 	movq      mmD,mmB
    252 	punpcklbw mmB,mmF		; mmB=(01 03 05 07)
    253 	punpckhbw mmD,mmF		; mmD=(11 13 15 17)
    254 
    255 	movq      mmG,mmE
    256 	punpcklbw mmE,mmF		; mmE=(20 22 24 26)
    257 	punpckhbw mmG,mmF		; mmG=(30 32 34 36)
    258 
    259 	punpcklbw mmF,mmH	; mmF is zero here: low four bytes of mmH land in the high byte of each word
    260 	punpckhbw mmH,mmH	; high four bytes of mmH, each duplicated into both bytes of a word
    261 	psrlw     mmF,BYTE_BIT		; mmF=(21 23 25 27)
    262 	psrlw     mmH,BYTE_BIT		; mmH=(31 33 35 37)
    263 
    264 %endif ; RGB_PIXELSIZE ; ---------------
    265 
    266 	; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
    267 	; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
    268 
    269 	; (Original)
    270 	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    271 	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    272 	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    273 	;
    274 	; (This implementation)
    275 	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    276 	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    277 	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    278 
    279 	movq      MMWORD [wk(0)], mm0	; wk(0)=RE
    280 	movq      MMWORD [wk(1)], mm1	; wk(1)=RO
    281 	movq      MMWORD [wk(2)], mm4	; wk(2)=BE
    282 	movq      MMWORD [wk(3)], mm5	; wk(3)=BO
    283 
    284 	movq      mm6,mm1
    285 	punpcklwd mm1,mm3	; mm1=(RO,GO) word pairs, low half -- the operand layout pmaddwd needs
    286 	punpckhwd mm6,mm3	; mm6=(RO,GO) word pairs, high half
    287 	movq      mm7,mm1	; second copy of the low pairs for the Cb terms
    288 	movq      mm4,mm6	; second copy of the high pairs for the Cb terms
    289 	pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    290 	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    291 	pmaddwd   mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
    292 	pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
    293 
    294 	movq      MMWORD [wk(4)], mm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
    295 	movq      MMWORD [wk(5)], mm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
    296 
    297 	pxor      mm1,mm1	; zero low words, so each B sample lands in the high word of its dword
    298 	pxor      mm6,mm6
    299 	punpcklwd mm1,mm5		; mm1=BOL
    300 	punpckhwd mm6,mm5		; mm6=BOH
    301 	psrld     mm1,1			; mm1=BOL*FIX(0.500)
    302 	psrld     mm6,1			; mm6=BOH*FIX(0.500)
    303 
    304 	movq      mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
    305 
    306 	paddd     mm7,mm1
    307 	paddd     mm4,mm6
    308 	paddd     mm7,mm5
    309 	paddd     mm4,mm5
    310 	psrld     mm7,SCALEBITS		; mm7=CbOL
    311 	psrld     mm4,SCALEBITS		; mm4=CbOH
    312 	packssdw  mm7,mm4		; mm7=CbO
    313 
    314 	movq      mm1, MMWORD [wk(2)]	; mm1=BE
    315 
    316 	movq      mm6,mm0
    317 	punpcklwd mm0,mm2	; mm0=(RE,GE) word pairs, low half
    318 	punpckhwd mm6,mm2	; mm6=(RE,GE) word pairs, high half
    319 	movq      mm5,mm0	; second copies for the Cb terms
    320 	movq      mm4,mm6
    321 	pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
    322 	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
    323 	pmaddwd   mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
    324 	pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
    325 
    326 	movq      MMWORD [wk(6)], mm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
    327 	movq      MMWORD [wk(7)], mm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
    328 
    329 	pxor      mm0,mm0	; zero low words, so each B sample lands in the high word of its dword
    330 	pxor      mm6,mm6
    331 	punpcklwd mm0,mm1		; mm0=BEL
    332 	punpckhwd mm6,mm1		; mm6=BEH
    333 	psrld     mm0,1			; mm0=BEL*FIX(0.500)
    334 	psrld     mm6,1			; mm6=BEH*FIX(0.500)
    335 
    336 	movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
    337 
    338 	paddd     mm5,mm0
    339 	paddd     mm4,mm6
    340 	paddd     mm5,mm1
    341 	paddd     mm4,mm1
    342 	psrld     mm5,SCALEBITS		; mm5=CbEL
    343 	psrld     mm4,SCALEBITS		; mm4=CbEH
    344 	packssdw  mm5,mm4		; mm5=CbE
    345 
    346 	psllw     mm7,BYTE_BIT	; move the odd-column Cb samples into the high byte of each word
    347 	por       mm5,mm7		; mm5=Cb (even columns in the low bytes, odd columns in the high bytes)
    348 	movq      MMWORD [ebx], mm5	; Save Cb (a full mmword is stored even on the tail pass -- assumes the output row has room, confirm against caller)
    349 
    350 	movq      mm0, MMWORD [wk(3)]	; mm0=BO
    351 	movq      mm6, MMWORD [wk(2)]	; mm6=BE
    352 	movq      mm1, MMWORD [wk(1)]	; mm1=RO
    353 
    354 	movq      mm4,mm0
    355 	punpcklwd mm0,mm3	; mm0=(BO,GO) word pairs, low half
    356 	punpckhwd mm4,mm3	; mm4=(BO,GO) word pairs, high half
    357 	movq      mm7,mm0	; second copies for the Cr terms
    358 	movq      mm5,mm4
    359 	pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    360 	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    361 	pmaddwd   mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
    362 	pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
    363 
    364 	movq      mm3,[GOTOFF(eax,PD_ONEHALF)]	; mm3=[PD_ONEHALF]
    365 
    366 	paddd     mm0, MMWORD [wk(4)]	; add the R/G partial sums saved earlier to complete Y (odd columns)
    367 	paddd     mm4, MMWORD [wk(5)]
    368 	paddd     mm0,mm3
    369 	paddd     mm4,mm3
    370 	psrld     mm0,SCALEBITS		; mm0=YOL
    371 	psrld     mm4,SCALEBITS		; mm4=YOH
    372 	packssdw  mm0,mm4		; mm0=YO
    373 
    374 	pxor      mm3,mm3	; zero low words, so each R sample lands in the high word of its dword
    375 	pxor      mm4,mm4
    376 	punpcklwd mm3,mm1		; mm3=ROL
    377 	punpckhwd mm4,mm1		; mm4=ROH
    378 	psrld     mm3,1			; mm3=ROL*FIX(0.500)
    379 	psrld     mm4,1			; mm4=ROH*FIX(0.500)
    380 
    381 	movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
    382 
    383 	paddd     mm7,mm3
    384 	paddd     mm5,mm4
    385 	paddd     mm7,mm1
    386 	paddd     mm5,mm1
    387 	psrld     mm7,SCALEBITS		; mm7=CrOL
    388 	psrld     mm5,SCALEBITS		; mm5=CrOH
    389 	packssdw  mm7,mm5		; mm7=CrO
    390 
    391 	movq      mm3, MMWORD [wk(0)]	; mm3=RE
    392 
    393 	movq      mm4,mm6
    394 	punpcklwd mm6,mm2	; mm6=(BE,GE) word pairs, low half
    395 	punpckhwd mm4,mm2	; mm4=(BE,GE) word pairs, high half
    396 	movq      mm1,mm6	; second copies for the Cr terms
    397 	movq      mm5,mm4
    398 	pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    399 	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    400 	pmaddwd   mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
    401 	pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
    402 
    403 	movq      mm2,[GOTOFF(eax,PD_ONEHALF)]	; mm2=[PD_ONEHALF]
    404 
    405 	paddd     mm6, MMWORD [wk(6)]	; add the R/G partial sums saved earlier to complete Y (even columns)
    406 	paddd     mm4, MMWORD [wk(7)]
    407 	paddd     mm6,mm2
    408 	paddd     mm4,mm2
    409 	psrld     mm6,SCALEBITS		; mm6=YEL
    410 	psrld     mm4,SCALEBITS		; mm4=YEH
    411 	packssdw  mm6,mm4		; mm6=YE
    412 
    413 	psllw     mm0,BYTE_BIT	; move the odd-column Y samples into the high byte of each word
    414 	por       mm6,mm0		; mm6=Y (even columns in the low bytes, odd columns in the high bytes)
    415 	movq      MMWORD [edi], mm6	; Save Y (a full mmword is stored even on the tail pass)
    416 
    417 	pxor      mm2,mm2	; zero low words, so each R sample lands in the high word of its dword
    418 	pxor      mm4,mm4
    419 	punpcklwd mm2,mm3		; mm2=REL
    420 	punpckhwd mm4,mm3		; mm4=REH
    421 	psrld     mm2,1			; mm2=REL*FIX(0.500)
    422 	psrld     mm4,1			; mm4=REH*FIX(0.500)
    423 
    424 	movq      mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
    425 
    426 	paddd     mm1,mm2
    427 	paddd     mm5,mm4
    428 	paddd     mm1,mm0
    429 	paddd     mm5,mm0
    430 	psrld     mm1,SCALEBITS		; mm1=CrEL
    431 	psrld     mm5,SCALEBITS		; mm5=CrEH
    432 	packssdw  mm1,mm5		; mm1=CrE
    433 
    434 	psllw     mm7,BYTE_BIT	; move the odd-column Cr samples into the high byte of each word
    435 	por       mm1,mm7		; mm1=Cr (even columns in the low bytes, odd columns in the high bytes)
    436 	movq      MMWORD [edx], mm1	; Save Cr (a full mmword is stored even on the tail pass)
    437 
    438 	sub	ecx, byte SIZEOF_MMWORD	; ecx = columns still to convert in this row
    439 	add	esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; inptr
    440 	add	edi, byte SIZEOF_MMWORD			; outptr0
    441 	add	ebx, byte SIZEOF_MMWORD			; outptr1
    442 	add	edx, byte SIZEOF_MMWORD			; outptr2
    443 	cmp	ecx, byte SIZEOF_MMWORD
    444 	jae	near .columnloop	; another full-width pass is available
    445 	test	ecx,ecx
    446 	jnz	near .column_ld1	; some columns left, but fewer than SIZEOF_MMWORD: finish with the tail loader
    447 
    448 	pop	ecx			; col
    449 	pop	esi	; restore the row-pointer cursors saved at .rowloop
    450 	pop	edi
    451 	pop	ebx
    452 	pop	edx
    453 	poppic	eax	; pairs with the pushpic at .rowloop: row counter back in eax
    454 
    455 	add	esi, byte SIZEOF_JSAMPROW	; input_buf
    456 	add	edi, byte SIZEOF_JSAMPROW	; next row pointer of output component 0
    457 	add	ebx, byte SIZEOF_JSAMPROW	; next row pointer of output component 1
    458 	add	edx, byte SIZEOF_JSAMPROW	; next row pointer of output component 2
    459 	dec	eax				; num_rows
    460 	jg	near .rowloop	; loop while rows remain (signed > 0)
    461 
    462 	emms		; empty MMX state
    463 
    464 .return:	; also reached directly by the early exits, before any MMX instruction has run
    465 	pop	edi
    466 	pop	esi
    467 ;	pop	edx		; need not be preserved
    468 ;	pop	ecx		; need not be preserved
    469 	pop	ebx
    470 	mov	esp,ebp		; esp <- aligned ebp
    471 	pop	esp		; esp <- original ebp
    472 	pop	ebp	; restore the caller's frame pointer
    473 	ret
    474 
    475 ; For some reason, the OS X linker does not honor the request to align the
    476 ; segment unless we do this.
    477 	align	16	; pad the end of the assembled code up to a 16-byte boundary
    478