Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jdclrmmx.asm - colorspace conversion (MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ;
      6 ; Based on
      7 ; x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler); it
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jcolsamp.inc"
     20 
     21 ; --------------------------------------------------------------------------
     22 ;
     23 ; Convert num_rows rows of YCbCr samples (from input_row on) to RGB/RGBX output pixels.
     24 ;
     25 ; GLOBAL(void)
     26 ; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
     27 ;                            JSAMPIMAGE input_buf, JDIMENSION input_row,
     28 ;                            JSAMPARRAY output_buf, int num_rows)
     29 ;
     30 
     31 %define out_width(b)	(b)+8			; JDIMENSION out_width (first stack argument)
     32 %define input_buf(b)	(b)+12		; JSAMPIMAGE input_buf
     33 %define input_row(b)	(b)+16		; JDIMENSION input_row
     34 %define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
     35 %define num_rows(b)	(b)+24		; int num_rows
        ; In the accessors above, b is the stack pointer value captured right after
        ; "push ebp" in the prologue, so [b+0] is the saved ebp, [b+4] the return
        ; address, and the arguments follow in 4-byte slots starting at [b+8].
     36 
        ; Locals, addressed from the MMWORD-aligned frame base that the prologue
        ; keeps in ebp:
     37 %define original_ebp	ebp+0		; slot where the prologue stores the unaligned frame base
     38 %define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM], directly below ebp
     39 %define WK_NUM		2		; wk(0), wk(1); %define expands at use time, so defining this after wk(i) is fine
     40 %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr (pointer-sized slot just below wk(0))
     41 
     42 	align	16
     43 	global	EXTN(jsimd_ycc_rgb_convert_mmx) PRIVATE
     44 
        ; Stack frame: ebp is an MMWORD-aligned frame base that is separate from
        ; the frame the arguments are read from; [ebp+0] holds the unaligned frame
        ; base, wk(0) and wk(1) lie directly below ebp, and gotptr sits below wk(0).
        ;
        ; Register roles:
        ;   eax  prologue: unaligned frame base used to read the arguments;
        ;        .rowloop: remaining row count; .columnloop: GOT address for
        ;        GOTOFF() (reused as scratch by the 3-byte-pixel tail store)
        ;   ecx  columns still to be written in the current row
        ;   esi, ebx, edx  .rowloop: cursors into the three component row-pointer
        ;        arrays of input_buf (Y, Cb, Cr per the loads below);
        ;        .columnloop: inptr0, inptr1, inptr2
        ;   edi  .rowloop: cursor into output_buf; .columnloop: outptr
        ;
        ; Every .columnloop pass loads a full MMWORD from each input row, even
        ; when fewer columns remain; only the output store is trimmed.
        ; NOTE(review): this presumably relies on the input rows being padded to
        ; a multiple of SIZEOF_MMWORD samples -- confirm against the caller.
     45 EXTN(jsimd_ycc_rgb_convert_mmx):
     46 	push	ebp				; save caller's ebp
     47 	mov	eax,esp				; eax = original ebp
     48 	sub	esp, byte 4			; leave room for the dword stored at [esp] below
     49 	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
     50 	mov	[esp],eax			; [original_ebp] = unaligned frame base
     51 	mov	ebp,esp				; ebp = aligned ebp
     52 	lea	esp, [wk(0)]			; reserve wk[WK_NUM] below ebp
     53 	pushpic	eax		; make room for the GOT address (gotptr slot)
     54 	push	ebx		; ebx, esi, edi are restored at .return
     55 ;	push	ecx		; need not be preserved
     56 ;	push	edx		; need not be preserved
     57 	push	esi
     58 	push	edi
     59 
     60 	get_GOT	ebx			; get GOT address
     61 	movpic	POINTER [gotptr], ebx	; save GOT address
     62 
     63 	mov	ecx, JDIMENSION [out_width(eax)]	; num_cols
     64 	test	ecx,ecx
     65 	jz	near .return			; zero-width output: nothing to convert
     66 
     67 	push	ecx				; park num_cols while ecx holds input_row
     68 
     69 	mov	edi, JSAMPIMAGE [input_buf(eax)]
     70 	mov	ecx, JDIMENSION [input_row(eax)]
     71 	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]	; esi = input_buf[0]
     72 	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]	; ebx = input_buf[1]
     73 	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]	; edx = input_buf[2]
     74 	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]	; esi = &input_buf[0][input_row]
     75 	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]	; ebx = &input_buf[1][input_row]
     76 	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]	; edx = &input_buf[2][input_row]
     77 
     78 	pop	ecx				; ecx = num_cols again
     79 
     80 	mov	edi, JSAMPARRAY [output_buf(eax)]
     81 	mov	eax, INT [num_rows(eax)]	; eax stops being the argument base here
     82 	test	eax,eax
     83 	jle	near .return			; signed test: return when num_rows <= 0
     84 	alignx	16,7
     85 .rowloop:
        	; Save the per-row cursors and counters; .nextrow pops them in
        	; reverse order before advancing to the next row.
     86 	push	eax
     87 	push	edi
     88 	push	edx
     89 	push	ebx
     90 	push	esi
     91 	push	ecx			; col
     92 
     93 	mov	esi, JSAMPROW [esi]	; inptr0
     94 	mov	ebx, JSAMPROW [ebx]	; inptr1
     95 	mov	edx, JSAMPROW [edx]	; inptr2
     96 	mov	edi, JSAMPROW [edi]	; outptr
     97 	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
     98 	alignx	16,7
     99 .columnloop:
    100 
    101 	movq	mm5, MMWORD [ebx]	; mm5=Cb(01234567)
    102 	movq	mm1, MMWORD [edx]	; mm1=Cr(01234567)
    103 
        	; Build the constants in registers instead of loading them:
        	; all-ones shifted right gives the low-byte mask per word, all-ones
        	; shifted left by 7 gives 0xFF80 (-128 as a signed word).
    104 	pcmpeqw	mm4,mm4
    105 	pcmpeqw	mm7,mm7
    106 	psrlw	mm4,BYTE_BIT
    107 	psllw	mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
    108 	movq	mm0,mm4			; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
    109 
        	; Split the packed bytes into even- and odd-indexed samples, one
        	; sample per 16-bit word.
    110 	pand	mm4,mm5			; mm4=Cb(0246)=CbE
    111 	psrlw	mm5,BYTE_BIT		; mm5=Cb(1357)=CbO
    112 	pand	mm0,mm1			; mm0=Cr(0246)=CrE
    113 	psrlw	mm1,BYTE_BIT		; mm1=Cr(1357)=CrO
    114 
    115 	paddw	mm4,mm7			; CbE - 128 (adding 0xFF80 modulo 2^16)
    116 	paddw	mm5,mm7			; CbO - 128
    117 	paddw	mm0,mm7			; CrE - 128
    118 	paddw	mm1,mm7			; CrO - 128
    119 
    120 	; (Original)
    121 	; R = Y                + 1.40200 * Cr
    122 	; G = Y - 0.34414 * Cb - 0.71414 * Cr
    123 	; B = Y + 1.77200 * Cb
    124 	;
    125 	; (This implementation)
    126 	; R = Y                + 0.40200 * Cr + Cr
    127 	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    128 	; B = Y - 0.22800 * Cb + Cb + Cb
    129 
    130 	movq	mm2,mm4			; mm2=CbE
    131 	movq	mm3,mm5			; mm3=CbO
    132 	paddw	mm4,mm4			; mm4=2*CbE
    133 	paddw	mm5,mm5			; mm5=2*CbO
    134 	movq	mm6,mm0			; mm6=CrE
    135 	movq	mm7,mm1			; mm7=CrO
    136 	paddw	mm0,mm0			; mm0=2*CrE
    137 	paddw	mm1,mm1			; mm1=2*CrO
    138 
        	; pmulhw keeps only the high word of each product.  The inputs were
        	; doubled above so that the "+PW_ONE, >>1" step below rounds the
        	; result instead of truncating it.
        	; NOTE(review): PW_ONE is defined outside this file; presumably four
        	; words of 1 -- confirm.
    139 	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbE * -FIX(0.22800))
    140 	pmulhw	mm5,[GOTOFF(eax,PW_MF0228)]	; mm5=(2*CbO * -FIX(0.22800))
    141 	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrE * FIX(0.40200))
    142 	pmulhw	mm1,[GOTOFF(eax,PW_F0402)]	; mm1=(2*CrO * FIX(0.40200))
    143 
    144 	paddw	mm4,[GOTOFF(eax,PW_ONE)]
    145 	paddw	mm5,[GOTOFF(eax,PW_ONE)]
    146 	psraw	mm4,1			; mm4=(CbE * -FIX(0.22800))
    147 	psraw	mm5,1			; mm5=(CbO * -FIX(0.22800))
    148 	paddw	mm0,[GOTOFF(eax,PW_ONE)]
    149 	paddw	mm1,[GOTOFF(eax,PW_ONE)]
    150 	psraw	mm0,1			; mm0=(CrE * FIX(0.40200))
    151 	psraw	mm1,1			; mm1=(CrO * FIX(0.40200))
    152 
    153 	paddw	mm4,mm2
    154 	paddw	mm5,mm3
    155 	paddw	mm4,mm2			; mm4=(CbE * FIX(1.77200))=(B-Y)E
    156 	paddw	mm5,mm3			; mm5=(CbO * FIX(1.77200))=(B-Y)O
    157 	paddw	mm0,mm6			; mm0=(CrE * FIX(1.40200))=(R-Y)E
    158 	paddw	mm1,mm7			; mm1=(CrO * FIX(1.40200))=(R-Y)O
    159 
        	; Spill (B-Y) to the wk[] slots: mm4/mm5 are reused as temporaries
        	; for the green computation and the Y load below.
    160 	movq	MMWORD [wk(0)], mm4	; wk(0)=(B-Y)E
    161 	movq	MMWORD [wk(1)], mm5	; wk(1)=(B-Y)O
    162 
        	; Interleave Cb and Cr words so that each pmaddwd dword lane
        	; multiplies one (Cb,Cr) pair by the constant pair and sums the two
        	; products.
    163 	movq      mm4,mm2
    164 	movq      mm5,mm3
    165 	punpcklwd mm2,mm6
    166 	punpckhwd mm4,mm6
    167 	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
    168 	pmaddwd   mm4,[GOTOFF(eax,PW_MF0344_F0285)]
    169 	punpcklwd mm3,mm7
    170 	punpckhwd mm5,mm7
    171 	pmaddwd   mm3,[GOTOFF(eax,PW_MF0344_F0285)]
    172 	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
    173 
        	; Add PD_ONEHALF and shift right by SCALEBITS to descale the 32-bit
        	; sums.  NOTE(review): PD_ONEHALF is defined outside this file;
        	; presumably 1 << (SCALEBITS-1) for rounding -- confirm.
    174 	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
    175 	paddd     mm4,[GOTOFF(eax,PD_ONEHALF)]
    176 	psrad     mm2,SCALEBITS
    177 	psrad     mm4,SCALEBITS
    178 	paddd     mm3,[GOTOFF(eax,PD_ONEHALF)]
    179 	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
    180 	psrad     mm3,SCALEBITS
    181 	psrad     mm5,SCALEBITS
    182 
    183 	packssdw  mm2,mm4	; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
    184 	packssdw  mm3,mm5	; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
    185 	psubw     mm2,mm6	; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
    186 	psubw     mm3,mm7	; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
    187 
    188 	movq      mm5, MMWORD [esi]	; mm5=Y(01234567)
    189 
    190 	pcmpeqw   mm4,mm4
    191 	psrlw     mm4,BYTE_BIT		; mm4={0xFF 0x00 0xFF 0x00 ..}
    192 	pand      mm4,mm5		; mm4=Y(0246)=YE
    193 	psrlw     mm5,BYTE_BIT		; mm5=Y(1357)=YO
    194 
        	; Add luma to each colour difference; packuswb then saturates every
        	; signed word to the unsigned byte range 0..255.
    195 	paddw     mm0,mm4		; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
    196 	paddw     mm1,mm5		; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
    197 	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
    198 	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
    199 
    200 	paddw     mm2,mm4		; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
    201 	paddw     mm3,mm5		; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
    202 	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
    203 	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
    204 
    205 	paddw     mm4, MMWORD [wk(0)]	; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
    206 	paddw     mm5, MMWORD [wk(1)]	; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
    207 	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
    208 	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
    209 
        	; From here on the registers are named mmA..mmH.  These aliases are
        	; not defined in this file (NOTE(review): presumably mapped onto
        	; mm0..mm7 by jcolsamp.inc according to the output component order --
        	; confirm).  In the lane diagrams the first digit is the output
        	; component index and the second digit is the pixel index.
    210 %if RGB_PIXELSIZE == 3 ; ---------------
    211 
    212 	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
    213 	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
    214 	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
    215 	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
    216 
    217 	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
    218 	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
    219 	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
    220 
    221 	movq      mmG,mmA
    222 	movq      mmH,mmA
    223 	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
    224 	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
    225 
    226 	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
    227 	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
    228 
    229 	movq      mmC,mmD
    230 	movq      mmB,mmD
    231 	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
    232 	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
    233 
    234 	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
    235 
    236 	movq      mmF,mmE
    237 	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
    238 	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
    239 
    240 	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
    241 	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
    242 	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
    243 
    244 	cmp	ecx, byte SIZEOF_MMWORD
    245 	jb	short .column_st16	; fewer than SIZEOF_MMWORD columns left: partial store
    246 
        	; Full group: mmA, mmE, mmC hold 8 pixels x 3 bytes in output order.
    247 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
    248 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
    249 	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
    250 
    251 	sub	ecx, byte SIZEOF_MMWORD
    252 	jz	short .nextrow		; row width was consumed exactly
    253 
    254 	add	esi, byte SIZEOF_MMWORD			; inptr0
    255 	add	ebx, byte SIZEOF_MMWORD			; inptr1
    256 	add	edx, byte SIZEOF_MMWORD			; inptr2
    257 	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
    258 	jmp	near .columnloop
    259 	alignx	16,7
    260 
        	; Tail store for the last, partial group of a row.  ecx is converted
        	; from columns to bytes, then the bytes are written in chunks of
        	; 2 MMWORDs, 1 MMWORD, dword, word and byte; after each chunk the
        	; still-unwritten data is moved down into mmA (or eax).
    261 .column_st16:
    262 	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE
    263 	cmp	ecx, byte 2*SIZEOF_MMWORD
    264 	jb	short .column_st8
    265 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
    266 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
    267 	movq	mmA,mmC			; third MMWORD holds whatever is left
    268 	sub	ecx, byte 2*SIZEOF_MMWORD
    269 	add	edi, byte 2*SIZEOF_MMWORD
    270 	jmp	short .column_st4	; less than one MMWORD remains, skip .column_st8
    271 .column_st8:
    272 	cmp	ecx, byte SIZEOF_MMWORD
    273 	jb	short .column_st4
    274 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
    275 	movq	mmA,mmE			; continue with the second MMWORD
    276 	sub	ecx, byte SIZEOF_MMWORD
    277 	add	edi, byte SIZEOF_MMWORD
    278 .column_st4:
    279 	movd	eax,mmA			; eax = low dword; the GOT address in eax is no longer needed in this row
    280 	cmp	ecx, byte SIZEOF_DWORD
    281 	jb	short .column_st2
    282 	mov	DWORD [edi+0*SIZEOF_DWORD], eax
    283 	psrlq	mmA,DWORD_BIT
    284 	movd	eax,mmA			; eax = high dword of the remaining data
    285 	sub	ecx, byte SIZEOF_DWORD
    286 	add	edi, byte SIZEOF_DWORD
    287 .column_st2:
    288 	cmp	ecx, byte SIZEOF_WORD
    289 	jb	short .column_st1
    290 	mov	WORD [edi+0*SIZEOF_WORD], ax
    291 	shr	eax,WORD_BIT		; bring the next byte down into al
    292 	sub	ecx, byte SIZEOF_WORD
    293 	add	edi, byte SIZEOF_WORD
    294 .column_st1:
    295 	cmp	ecx, byte SIZEOF_BYTE
    296 	jb	short .nextrow
    297 	mov	BYTE [edi+0*SIZEOF_BYTE], al
    298 
    299 %else ; RGB_PIXELSIZE == 4 ; -----------
    300 
        	; Fill value for the fourth byte of each pixel: all ones when
        	; RGBX_FILLER_0XFF is defined, zero otherwise.
    301 %ifdef RGBX_FILLER_0XFF
    302 	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
    303 	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
    304 %else
    305 	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
    306 	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
    307 %endif
    308 	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
    309 	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
    310 	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
    311 	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
    312 
    313 	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
    314 	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
    315 	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
    316 	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
    317 
    318 	movq      mmC,mmA
    319 	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
    320 	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
    321 	movq      mmG,mmB
    322 	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
    323 	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
    324 
    325 	movq      mmD,mmA
    326 	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
    327 	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
    328 	movq      mmH,mmC
    329 	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
    330 	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
    331 
    332 	cmp	ecx, byte SIZEOF_MMWORD
    333 	jb	short .column_st16	; fewer than SIZEOF_MMWORD columns left: partial store
    334 
        	; Full group: mmA, mmD, mmC, mmH hold 8 pixels x 4 bytes in order.
    335 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
    336 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
    337 	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
    338 	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
    339 
    340 	sub	ecx, byte SIZEOF_MMWORD
    341 	jz	short .nextrow		; row width was consumed exactly
    342 
    343 	add	esi, byte SIZEOF_MMWORD			; inptr0
    344 	add	ebx, byte SIZEOF_MMWORD			; inptr1
    345 	add	edx, byte SIZEOF_MMWORD			; inptr2
    346 	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
    347 	jmp	near .columnloop
    348 	alignx	16,7
    349 
        	; Tail store for the last, partial group of a row.  Here ecx stays in
        	; columns: each MMWORD holds two 4-byte pixels, so the thresholds are
        	; SIZEOF_MMWORD/2, /4 and /8 columns for 2 MMWORDs, 1 MMWORD and one
        	; dword respectively.
    350 .column_st16:
    351 	cmp	ecx, byte SIZEOF_MMWORD/2
    352 	jb	short .column_st8
    353 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
    354 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
    355 	movq	mmA,mmC			; continue with the upper four pixels
    356 	movq	mmD,mmH
    357 	sub	ecx, byte SIZEOF_MMWORD/2
    358 	add	edi, byte 2*SIZEOF_MMWORD
    359 .column_st8:
    360 	cmp	ecx, byte SIZEOF_MMWORD/4
    361 	jb	short .column_st4
    362 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
    363 	movq	mmA,mmD			; continue with the next two pixels
    364 	sub	ecx, byte SIZEOF_MMWORD/4
    365 	add	edi, byte 1*SIZEOF_MMWORD
    366 .column_st4:
    367 	cmp	ecx, byte SIZEOF_MMWORD/8
    368 	jb	short .nextrow
    369 	movd	DWORD [edi+0*SIZEOF_DWORD], mmA	; last single pixel
    370 
    371 %endif ; RGB_PIXELSIZE ; ---------------
    372 
        	; The tail-store code above falls through into .nextrow.
        	; NOTE(review): alignx is defined outside this file; presumably it
        	; pads with executable filler -- confirm.
    373 	alignx	16,7
    374 
    375 .nextrow:
    376 	pop	ecx			; restore num_cols
    377 	pop	esi
    378 	pop	ebx
    379 	pop	edx
    380 	pop	edi
    381 	pop	eax			; restore remaining row count
    382 
    383 	add	esi, byte SIZEOF_JSAMPROW	; next row pointer of input_buf[0]
    384 	add	ebx, byte SIZEOF_JSAMPROW	; next row pointer of input_buf[1]
    385 	add	edx, byte SIZEOF_JSAMPROW	; next row pointer of input_buf[2]
    386 	add	edi, byte SIZEOF_JSAMPROW	; output_buf
    387 	dec	eax				; num_rows
    388 	jg	near .rowloop
    389 
        	; The early exits above jump straight to .return and skip emms; no
        	; MMX instruction has been executed on those paths.
    390 	emms		; empty MMX state
    391 
    392 .return:
    393 	pop	edi
    394 	pop	esi
    395 ;	pop	edx		; need not be preserved
    396 ;	pop	ecx		; need not be preserved
    397 	pop	ebx
    398 	mov	esp,ebp		; esp <- aligned ebp
    399 	pop	esp		; esp <- original ebp
    400 	pop	ebp
    401 	ret
    402 
    403 ; For some reason, the OS X linker does not honor the request to align the
    404 ; segment unless we do this.
    405 	align	16		; pad the end of this file's code to a 16-byte boundary
    406