Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jisseflt.asm - floating-point IDCT (SSE & MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ;
      6 ; Based on
      7 ; x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler); it
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; This file contains a floating-point implementation of the inverse DCT
     18 ; (Discrete Cosine Transform). The following code is based directly on
     19 ; the IJG's original jidctflt.c; see jidctflt.c for more details.
     20 ;
     21 ; [TAB8]
     22 
     23 %include "jsimdext.inc"
     24 %include "jdct.inc"
     25 
     26 ; --------------------------------------------------------------------------
     27 
     28 %macro	unpcklps2 2
        	; Pair up the low 64 bits of both operands:
        	;   in : %1=(0 1 2 3)  %2=(4 5 6 7)
        	;   out: %1=(0 1 4 5)
        	; The shufps selector is spelled out lane by lane (low to high):
        	; result lanes 0-1 index into %1, result lanes 2-3 index into %2.
     29 	shufps	%1, %2, (0<<0)|(1<<2)|(0<<4)|(1<<6)
     30 %endmacro
     31 
     32 %macro	unpckhps2 2
        	; Pair up the high 64 bits of both operands:
        	;   in : %1=(0 1 2 3)  %2=(4 5 6 7)
        	;   out: %1=(2 3 6 7)
        	; The shufps selector is spelled out lane by lane (low to high):
        	; result lanes 0-1 index into %1, result lanes 2-3 index into %2.
     33 	shufps	%1, %2, (2<<0)|(3<<2)|(2<<4)|(3<<6)
     34 %endmacro
     35 
     36 ; --------------------------------------------------------------------------
     37 	SECTION	SEG_CONST
     38 
        ; ps_x4 LABEL, VALUE
        ;   Emit VALUE four times as single-precision floats (one packed,
        ;   XMM-sized constant) and attach LABEL to the first element.
        %macro	ps_x4 2
        %1	times 4 dd %2
        %endmacro

     39 	alignz	16
     40 	global	EXTN(jconst_idct_float_sse) PRIVATE
     41 
        ; Constant pool for jsimd_idct_float_sse.  The packed-float entries are
        ; used directly as 16-byte memory operands of mulps/movaps, so they are
        ; kept first, right after the 16-byte alignment above; the 8-byte MMX
        ; constant comes last.
     42 EXTN(jconst_idct_float_sse):
     43 
     44 	ps_x4	PD_1_414,   1.414213562373095048801689	; sqrt(2); scales tmp12 (even) and tmp11 (odd)
     45 	ps_x4	PD_1_847,   1.847759065022573512256366	; multiplier for z5 = (z10 + z12) * 1.847...
     46 	ps_x4	PD_1_082,   1.082392200292393968799446	; multiplier applied to z12
     47 	ps_x4	PD_M2_613, -2.613125929752753055713286	; multiplier applied to z10
     48 	ps_x4	PD_0_125,   0.125			; 1/8, final descale in pass 2
     49 PB_CENTERJSAMP:	times 8 db CENTERJSAMPLE	; added to each packed output byte (paddb)
     50 
     51 	alignz	16
     52 
     53 ; --------------------------------------------------------------------------
     54 	SECTION	SEG_TEXT
     55 	BITS	32
     56 ;
     57 ; Perform dequantization and inverse DCT on one block of coefficients.
     58 ;
     59 ; GLOBAL(void)
     60 ; jsimd_idct_float_sse (void * dct_table, JCOEFPTR coef_block,
     61 ;                       JSAMPARRAY output_buf, JDIMENSION output_col)
     62 ;
        ; Overview
        ;   Pass 1 walks the coefficient block four columns at a time: the
        ;   16-bit coefficients are sign-extended, converted to float,
        ;   multiplied by the matching dct_table entries, run through the 1-D
        ;   IDCT and stored, transposed, into an on-stack float workspace.
        ;   Pass 2 reads that workspace four rows at a time, runs the same 1-D
        ;   IDCT, multiplies by 1/8, converts to saturated signed bytes, adds
        ;   CENTERJSAMPLE and stores 8 samples into each output row.
        ;
        ; Arguments (32-bit code; all four are read from the caller's stack)
        ;   dct_table  - packed single-precision multipliers, one per
        ;                coefficient, consumed as 16-byte mulps operands.
        ;                NOTE(review): mulps memory operands must be 16-byte
        ;                aligned, so the table presumably has to be aligned
        ;                by the caller - confirm.
        ;   coef_block - input coefficients, loaded 4 at a time with movq as
        ;                signed 16-bit words.
        ;   output_buf - array of row pointers; DCTSIZE/4 iterations store to
        ;                four consecutive row pointers each.
        ;   output_col - sample index added (scaled by SIZEOF_JSAMPLE) to
        ;                every row pointer before the 8-byte store.
        ;
        ; Registers
        ;   ebx, esi, edi and ebp are saved and restored.  eax, ecx, edx,
        ;   mm0-mm7 and xmm0-xmm7 are overwritten.  emms is executed before
        ;   returning.
        ;
     63 
        ; Stack-argument accessors.  b is the address of the saved caller ebp
        ; (held in eax by the prologue and reloaded from [original_ebp] for
        ; pass 2), so the return address is at b+4 and the first argument at
        ; b+8.
     64 %define dct_table(b)	(b)+8			; void * dct_table
     65 %define coef_block(b)	(b)+12		; JCOEFPTR coef_block
     66 %define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
     67 %define output_col(b)	(b)+20		; JDIMENSION output_col
     68 
        ; Local frame, addressed from the realigned ebp set up in the prologue:
        ;   [original_ebp]  slot holding the pre-alignment stack pointer
        ;   wk(0)..wk(1)    WK_NUM XMM-sized spill slots directly below ebp
        ;   workspace       DCTSIZE2 FAST_FLOATs directly below wk(0);
        ;                   written by pass 1, read by pass 2
        ; (wk() refers to WK_NUM before its %define; that is fine because the
        ; macro body is only expanded at the point of use.)
     69 %define original_ebp	ebp+0
     70 %define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
     71 %define WK_NUM		2
     72 %define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
     73 					; FAST_FLOAT workspace[DCTSIZE2]
     74 
     75 	align	16
     76 	global	EXTN(jsimd_idct_float_sse) PRIVATE
     77 
     78 EXTN(jsimd_idct_float_sse):
        	; Prologue: build a frame aligned to SIZEOF_XMMWORD so that wk[] and
        	; workspace[] can be accessed with movaps.
     79 	push	ebp
     80 	mov	eax,esp				; eax = original ebp
        	; ("original ebp" = esp right after the push, i.e. the address of
        	; the saved ebp; the stack arguments are at eax+8 and up.)
     81 	sub	esp, byte 4			; guarantee room for the slot written below
     82 	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
     83 	mov	[esp],eax			; [original_ebp] = pre-alignment stack pointer
     84 	mov	ebp,esp				; ebp = aligned ebp
     85 	lea	esp, [workspace]		; reserve wk[] and workspace[] below ebp
     86 	push	ebx
     87 ;	push	ecx		; need not be preserved
     88 ;	push	edx		; need not be preserved
     89 	push	esi
     90 	push	edi
     91 
        	; ebx is the base register for every GOTOFF(ebx,...) constant
        	; reference below.
     92 	get_GOT	ebx		; get GOT address
     93 
     94 	; ---- Pass 1: process columns from input, store into work array.
     95 
        	; eax still holds the value stored in [original_ebp] by the
        	; prologue, so the reload below is not needed.  eax is overwritten
        	; inside the loop, after both arguments have been fetched.
     96 ;	mov	eax, [original_ebp]
     97 	mov	edx, POINTER [dct_table(eax)]	; quantptr
     98 	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
     99 	lea	edi, [workspace]			; FAST_FLOAT * wsptr
    100 	mov	ecx, DCTSIZE/4				; ctr
    101 	alignx	16,7
        	; Each iteration handles four adjacent columns: esi/edx advance by
        	; four coefficients and edi by four workspace rows at .nextcolumn.
    102 .columnloop:
        	; Shortcut for columns whose AC terms (rows 1-7) are all zero;
        	; compiled in unless NO_ZERO_COLUMN_TEST_FLOAT_SSE is defined.
    103 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
        	; Cheap scalar pre-check on one dword from each of rows 1 and 2:
        	; any set bit means the full transform is needed.
    104 	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    105 	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    106 	jnz	near .columnDCT
    107 
        	; OR together rows 1-7 of all four columns (4 words per row).
    108 	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    109 	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    110 	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    111 	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    112 	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    113 	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    114 	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    115 	por	mm1,mm0
        	; Squeeze the four words into four bytes so that a single 32-bit
        	; test covers them; signed saturation maps every non-zero word to
        	; a non-zero byte, so no set bit is lost.
    116 	packsswb mm1,mm1
    117 	movd	eax,mm1
    118 	test	eax,eax
    119 	jnz	short .columnDCT
    120 
    121 	; -- AC terms all zero
    122 
        	; With in1..in7 all zero, every term of the 1-D IDCT below reduces
        	; to in0, so all eight outputs of a column equal its dequantized DC
        	; value.  Only row 0 has to be loaded and scaled.
    123 	movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    124 
        	; Sign-extend the four words to dwords.  punpckhwd places the
        	; source words in the upper half of each dword (the lower halves
        	; are stale mm1 contents, shown as **); the arithmetic shift then
        	; sign-extends and discards the stale halves.
    125 	punpckhwd mm1,mm0			; mm1=(** 02 ** 03)
    126 	punpcklwd mm0,mm0			; mm0=(00 00 01 01)
    127 	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in0H=(02 03)
    128 	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in0L=(00 01)
        	; cvtpi2ps only writes the low two floats of its destination;
        	; movlhps then joins the two halves.
    129 	cvtpi2ps  xmm3,mm1			; xmm3=(02 03 ** **)
    130 	cvtpi2ps  xmm0,mm0			; xmm0=(00 01 ** **)
    131 	movlhps   xmm0,xmm3			; xmm0=in0=(00 01 02 03)
    132 
        	; Dequantize: multiply by row 0 of the multiplier table.
    133 	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    134 
    135 	movaps	xmm1,xmm0
    136 	movaps	xmm2,xmm0
    137 	movaps	xmm3,xmm0
    138 
        	; Broadcast each column's value across a whole register.
    139 	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
    140 	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
    141 	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
    142 	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
    143 
        	; Workspace is stored transposed: workspace row n receives the
        	; eight (identical) outputs of input column n, as two 4-float
        	; halves.
    144 	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
    145 	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
    146 	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
    147 	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
    148 	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
    149 	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
    150 	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    151 	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
    152 	jmp	near .nextcolumn
    153 	alignx	16,7
    154 %endif
    155 .columnDCT:
    156 
    157 	; -- Even part
    158 
        	; Load rows 0, 2, 4, 6 (four columns each), sign-extend to dwords
        	; with the punpck/psrad trick, convert to float and dequantize.
    159 	movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    160 	movq      mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    161 	movq      mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    162 	movq      mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    163 
    164 	punpckhwd mm4,mm0			; mm4=(** 02 ** 03)
    165 	punpcklwd mm0,mm0			; mm0=(00 00 01 01)
    166 	punpckhwd mm5,mm1			; mm5=(** 22 ** 23)
    167 	punpcklwd mm1,mm1			; mm1=(20 20 21 21)
    168 
    169 	psrad     mm4,(DWORD_BIT-WORD_BIT)	; mm4=in0H=(02 03)
    170 	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in0L=(00 01)
    171 	cvtpi2ps  xmm4,mm4			; xmm4=(02 03 ** **)
    172 	cvtpi2ps  xmm0,mm0			; xmm0=(00 01 ** **)
    173 	psrad     mm5,(DWORD_BIT-WORD_BIT)	; mm5=in2H=(22 23)
    174 	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in2L=(20 21)
    175 	cvtpi2ps  xmm5,mm5			; xmm5=(22 23 ** **)
    176 	cvtpi2ps  xmm1,mm1			; xmm1=(20 21 ** **)
    177 
    178 	punpckhwd mm6,mm2			; mm6=(** 42 ** 43)
    179 	punpcklwd mm2,mm2			; mm2=(40 40 41 41)
    180 	punpckhwd mm7,mm3			; mm7=(** 62 ** 63)
    181 	punpcklwd mm3,mm3			; mm3=(60 60 61 61)
    182 
    183 	psrad     mm6,(DWORD_BIT-WORD_BIT)	; mm6=in4H=(42 43)
    184 	psrad     mm2,(DWORD_BIT-WORD_BIT)	; mm2=in4L=(40 41)
    185 	cvtpi2ps  xmm6,mm6			; xmm6=(42 43 ** **)
    186 	cvtpi2ps  xmm2,mm2			; xmm2=(40 41 ** **)
    187 	psrad     mm7,(DWORD_BIT-WORD_BIT)	; mm7=in6H=(62 63)
    188 	psrad     mm3,(DWORD_BIT-WORD_BIT)	; mm3=in6L=(60 61)
    189 	cvtpi2ps  xmm7,mm7			; xmm7=(62 63 ** **)
    190 	cvtpi2ps  xmm3,mm3			; xmm3=(60 61 ** **)
    191 
    192 	movlhps   xmm0,xmm4			; xmm0=in0=(00 01 02 03)
    193 	movlhps   xmm1,xmm5			; xmm1=in2=(20 21 22 23)
    194 	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    195 	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    196 
    197 	movlhps   xmm2,xmm6			; xmm2=in4=(40 41 42 43)
    198 	movlhps   xmm3,xmm7			; xmm3=in6=(60 61 62 63)
    199 	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    200 	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    201 
        	; tmp10 = in0 + in4        tmp11 = in0 - in4
        	; tmp13 = in2 + in6        tmp12 = (in2 - in6) * 1.414... - tmp13
    202 	movaps	xmm4,xmm0
    203 	movaps	xmm5,xmm1
    204 	subps	xmm0,xmm2		; xmm0=tmp11
    205 	subps	xmm1,xmm3
    206 	addps	xmm4,xmm2		; xmm4=tmp10
    207 	addps	xmm5,xmm3		; xmm5=tmp13
    208 
    209 	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
    210 	subps	xmm1,xmm5		; xmm1=tmp12
    211 
        	; tmp0 = tmp10 + tmp13     tmp3 = tmp10 - tmp13
        	; tmp1 = tmp11 + tmp12     tmp2 = tmp11 - tmp12
    212 	movaps	xmm6,xmm4
    213 	movaps	xmm7,xmm0
    214 	subps	xmm4,xmm5		; xmm4=tmp3
    215 	subps	xmm0,xmm1		; xmm0=tmp2
    216 	addps	xmm6,xmm5		; xmm6=tmp0
    217 	addps	xmm7,xmm1		; xmm7=tmp1
    218 
        	; Spill tmp2/tmp3: the odd part needs xmm0-xmm5; tmp0/tmp1 stay
        	; live in xmm6/xmm7.
    219 	movaps	XMMWORD [wk(1)], xmm4	; tmp3
    220 	movaps	XMMWORD [wk(0)], xmm0	; tmp2
    221 
    222 	; -- Odd part
    223 
        	; Same load / sign-extend / convert / dequantize sequence for rows
        	; 1, 3, 5 and 7.
    224 	movq      mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    225 	movq      mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    226 	movq      mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    227 	movq      mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    228 
    229 	punpckhwd mm6,mm4			; mm6=(** 12 ** 13)
    230 	punpcklwd mm4,mm4			; mm4=(10 10 11 11)
    231 	punpckhwd mm2,mm0			; mm2=(** 32 ** 33)
    232 	punpcklwd mm0,mm0			; mm0=(30 30 31 31)
    233 
    234 	psrad     mm6,(DWORD_BIT-WORD_BIT)	; mm6=in1H=(12 13)
    235 	psrad     mm4,(DWORD_BIT-WORD_BIT)	; mm4=in1L=(10 11)
    236 	cvtpi2ps  xmm4,mm6			; xmm4=(12 13 ** **)
    237 	cvtpi2ps  xmm2,mm4			; xmm2=(10 11 ** **)
    238 	psrad     mm2,(DWORD_BIT-WORD_BIT)	; mm2=in3H=(32 33)
    239 	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in3L=(30 31)
    240 	cvtpi2ps  xmm0,mm2			; xmm0=(32 33 ** **)
    241 	cvtpi2ps  xmm3,mm0			; xmm3=(30 31 ** **)
    242 
    243 	punpckhwd mm7,mm5			; mm7=(** 52 ** 53)
    244 	punpcklwd mm5,mm5			; mm5=(50 50 51 51)
    245 	punpckhwd mm3,mm1			; mm3=(** 72 ** 73)
    246 	punpcklwd mm1,mm1			; mm1=(70 70 71 71)
    247 
    248 	movlhps   xmm2,xmm4			; xmm2=in1=(10 11 12 13)
    249 	movlhps   xmm3,xmm0			; xmm3=in3=(30 31 32 33)
    250 
    251 	psrad     mm7,(DWORD_BIT-WORD_BIT)	; mm7=in5H=(52 53)
    252 	psrad     mm5,(DWORD_BIT-WORD_BIT)	; mm5=in5L=(50 51)
    253 	cvtpi2ps  xmm4,mm7			; xmm4=(52 53 ** **)
    254 	cvtpi2ps  xmm5,mm5			; xmm5=(50 51 ** **)
    255 	psrad     mm3,(DWORD_BIT-WORD_BIT)	; mm3=in7H=(72 73)
    256 	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in7L=(70 71)
    257 	cvtpi2ps  xmm0,mm3			; xmm0=(72 73 ** **)
    258 	cvtpi2ps  xmm1,mm1			; xmm1=(70 71 ** **)
    259 
    260 	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    261 	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    262 
    263 	movlhps   xmm5,xmm4			; xmm5=in5=(50 51 52 53)
    264 	movlhps   xmm1,xmm0			; xmm1=in7=(70 71 72 73)
    265 	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    266 	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    267 
        	; z11 = in1 + in7          z12 = in1 - in7
        	; z13 = in5 + in3          z10 = in5 - in3
    268 	movaps	xmm4,xmm2
    269 	movaps	xmm0,xmm5
    270 	addps	xmm2,xmm1		; xmm2=z11
    271 	addps	xmm5,xmm3		; xmm5=z13
    272 	subps	xmm4,xmm1		; xmm4=z12
    273 	subps	xmm0,xmm3		; xmm0=z10
    274 
        	; tmp7 = z11 + z13         tmp11 = (z11 - z13) * 1.414...
    275 	movaps	xmm1,xmm2
    276 	subps	xmm2,xmm5
    277 	addps	xmm1,xmm5		; xmm1=tmp7
    278 
    279 	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
    280 
        	; z5    = (z10 + z12) * 1.847...
        	; tmp12 = z10 * -2.613... + z5
        	; tmp10 = z12 *  1.082... - z5
    281 	movaps	xmm3,xmm0
    282 	addps	xmm0,xmm4
    283 	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
    284 	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
    285 	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
    286 	addps	xmm3,xmm0		; xmm3=tmp12
    287 	subps	xmm4,xmm0		; xmm4=tmp10
    288 
    289 	; -- Final output stage
    290 
        	; tmp6 = tmp12 - tmp7      tmp5 = tmp11 - tmp6
        	; data0/7 = tmp0 +/- tmp7  data1/6 = tmp1 +/- tmp6
    291 	subps	xmm3,xmm1		; xmm3=tmp6
    292 	movaps	xmm5,xmm6
    293 	movaps	xmm0,xmm7
    294 	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
    295 	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
    296 	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
    297 	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
    298 	subps	xmm2,xmm3		; xmm2=tmp5
    299 
        	; The results are transposed in 4x4 pieces before being stored, so
        	; that pass 2 can load one workspace row per input term.
    300 	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
    301 	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
    302 	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
    303 	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
    304 	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
    305 	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
    306 
        	; Swap through the spill slots: reload tmp2/tmp3 and park the
        	; half-transposed rows 6/7 in their place.
    307 	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
    308 	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
    309 
    310 	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
    311 	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
    312 
        	; tmp4 = tmp10 + tmp5
        	; data2/5 = tmp2 +/- tmp5  data4/3 = tmp3 +/- tmp4
    313 	addps	xmm4,xmm2		; xmm4=tmp4
    314 	movaps	xmm0,xmm7
    315 	movaps	xmm3,xmm5
    316 	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
    317 	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
    318 	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
    319 	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
    320 
    321 	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
    322 	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
    323 	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
    324 	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
    325 	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
    326 	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
    327 
    328 	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
    329 	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
    330 	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
    331 	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
    332 	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
    333 	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
    334 
    335 	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
    336 	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
    337 
        	; First half (outputs 0-3) of the four workspace rows.
    338 	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
    339 	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    340 	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
    341 	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
    342 
    343 	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
    344 	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
    345 	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
    346 	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
    347 	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
    348 	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
    349 
        	; Second half (outputs 4-7) of the same four workspace rows.
    350 	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
    351 	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
    352 	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
    353 	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
    354 
    355 .nextcolumn:
        	; Step to the next group of four columns: four coefficients and
        	; four multipliers along the row, four full rows in the workspace.
        	; NOTE(review): the third add has no `byte` override, presumably
        	; because 4*DCTSIZE*SIZEOF_FAST_FLOAT does not fit in a signed
        	; 8-bit immediate - confirm against the jsimdext.inc values.
    356 	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
    357 	add	edx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
    358 	add	edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
    359 	dec	ecx					; ctr
    360 	jnz	near .columnloop
    361 
    362 	; -- Prefetch the next coefficient block
    363 
        	; esi has advanced by 4 coefficients per iteration of the loop
        	; above.  NOTE(review): with DCTSIZE = 8 (two iterations) the first
        	; address below is exactly DCTSIZE2 coefficients past coef_block,
        	; i.e. the start of the following block - confirm DCTSIZE.
    364 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
    365 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
    366 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
    367 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
    368 
    369 	; ---- Pass 2: process rows from work array, store into output array.
    370 
        	; eax was overwritten in pass 1, so the argument base is reloaded
        	; from the frame.  After the two loads below eax holds output_col
        	; for the rest of the function.
    371 	mov	eax, [original_ebp]
    372 	lea	esi, [workspace]			; FAST_FLOAT * wsptr
    373 	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
    374 	mov	eax, JDIMENSION [output_col(eax)]
    375 	mov	ecx, DCTSIZE/4				; ctr
    376 	alignx	16,7
        	; Each iteration produces four output rows.  Because the workspace
        	; is transposed, workspace row n supplies input term n for four
        	; rows at once; esi advances by four floats per iteration.
    377 .rowloop:
    378 
    379 	; -- Even part
    380 
    381 	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    382 	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
    383 	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
    384 	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
    385 
        	; Same even-part arithmetic as in pass 1, on workspace values.
    386 	movaps	xmm4,xmm0
    387 	movaps	xmm5,xmm1
    388 	subps	xmm0,xmm2		; xmm0=tmp11
    389 	subps	xmm1,xmm3
    390 	addps	xmm4,xmm2		; xmm4=tmp10
    391 	addps	xmm5,xmm3		; xmm5=tmp13
    392 
    393 	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
    394 	subps	xmm1,xmm5		; xmm1=tmp12
    395 
    396 	movaps	xmm6,xmm4
    397 	movaps	xmm7,xmm0
    398 	subps	xmm4,xmm5		; xmm4=tmp3
    399 	subps	xmm0,xmm1		; xmm0=tmp2
    400 	addps	xmm6,xmm5		; xmm6=tmp0
    401 	addps	xmm7,xmm1		; xmm7=tmp1
    402 
    403 	movaps	XMMWORD [wk(1)], xmm4	; tmp3
    404 	movaps	XMMWORD [wk(0)], xmm0	; tmp2
    405 
    406 	; -- Odd part
    407 
    408 	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    409 	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
    410 	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
    411 	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
    412 
        	; Same odd-part arithmetic as in pass 1.
    413 	movaps	xmm4,xmm2
    414 	movaps	xmm0,xmm5
    415 	addps	xmm2,xmm1		; xmm2=z11
    416 	addps	xmm5,xmm3		; xmm5=z13
    417 	subps	xmm4,xmm1		; xmm4=z12
    418 	subps	xmm0,xmm3		; xmm0=z10
    419 
    420 	movaps	xmm1,xmm2
    421 	subps	xmm2,xmm5
    422 	addps	xmm1,xmm5		; xmm1=tmp7
    423 
    424 	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
    425 
    426 	movaps	xmm3,xmm0
    427 	addps	xmm0,xmm4
    428 	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
    429 	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
    430 	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
    431 	addps	xmm3,xmm0		; xmm3=tmp12
    432 	subps	xmm4,xmm0		; xmm4=tmp10
    433 
    434 	; -- Final output stage
    435 
        	; xmm4 (tmp10) and xmm2 (tmp5) must stay untouched until the
        	; "tmp4" addps further down; only xmm0/1/3/5/6/7 are used as
        	; scratch in between.
    436 	subps	xmm3,xmm1		; xmm3=tmp6
    437 	movaps	xmm5,xmm6
    438 	movaps	xmm0,xmm7
    439 	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
    440 	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
    441 	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
    442 	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
    443 	subps	xmm2,xmm3		; xmm2=tmp5
    444 
    445 	movaps	xmm1,[GOTOFF(ebx,PD_0_125)]	; xmm1=[PD_0_125]
    446 
    447 	mulps	xmm6,xmm1		; descale(1/8)
    448 	mulps	xmm7,xmm1		; descale(1/8)
    449 	mulps	xmm5,xmm1		; descale(1/8)
    450 	mulps	xmm0,xmm1		; descale(1/8)
    451 
        	; cvtps2pi converts only the low two floats of its source, so the
        	; high pairs are first copied down with movhlps.  packssdw then
        	; narrows the int32 results to int16 with signed saturation.
    452 	movhlps   xmm3,xmm6
    453 	movhlps   xmm1,xmm7
    454 	cvtps2pi  mm0,xmm6		; round to int32, mm0=data0L=(00 10)
    455 	cvtps2pi  mm1,xmm7		; round to int32, mm1=data1L=(01 11)
    456 	cvtps2pi  mm2,xmm3		; round to int32, mm2=data0H=(20 30)
    457 	cvtps2pi  mm3,xmm1		; round to int32, mm3=data1H=(21 31)
    458 	packssdw  mm0,mm2		; mm0=data0=(00 10 20 30)
    459 	packssdw  mm1,mm3		; mm1=data1=(01 11 21 31)
    460 
    461 	movhlps   xmm6,xmm5
    462 	movhlps   xmm7,xmm0
    463 	cvtps2pi  mm4,xmm5		; round to int32, mm4=data7L=(07 17)
    464 	cvtps2pi  mm5,xmm0		; round to int32, mm5=data6L=(06 16)
    465 	cvtps2pi  mm6,xmm6		; round to int32, mm6=data7H=(27 37)
    466 	cvtps2pi  mm7,xmm7		; round to int32, mm7=data6H=(26 36)
    467 	packssdw  mm4,mm6		; mm4=data7=(07 17 27 37)
    468 	packssdw  mm5,mm7		; mm5=data6=(06 16 26 36)
    469 
        	; Narrow to signed bytes (saturating) and pair columns 0/6 and 1/7.
    470 	packsswb  mm0,mm5		; mm0=(00 10 20 30 06 16 26 36)
    471 	packsswb  mm1,mm4		; mm1=(01 11 21 31 07 17 27 37)
    472 
    473 	movaps	xmm3, XMMWORD [wk(0)]	; xmm3=tmp2
    474 	movaps	xmm1, XMMWORD [wk(1)]	; xmm1=tmp3
    475 
        	; Reloaded because xmm1, which held 1/8 above, has been reused.
    476 	movaps	xmm6,[GOTOFF(ebx,PD_0_125)]	; xmm6=[PD_0_125]
    477 
    478 	addps	xmm4,xmm2		; xmm4=tmp4
    479 	movaps	xmm5,xmm3
    480 	movaps	xmm0,xmm1
    481 	addps	xmm3,xmm2		; xmm3=data2=(02 12 22 32)
    482 	addps	xmm1,xmm4		; xmm1=data4=(04 14 24 34)
    483 	subps	xmm5,xmm2		; xmm5=data5=(05 15 25 35)
    484 	subps	xmm0,xmm4		; xmm0=data3=(03 13 23 33)
    485 
    486 	mulps	xmm3,xmm6		; descale(1/8)
    487 	mulps	xmm1,xmm6		; descale(1/8)
    488 	mulps	xmm5,xmm6		; descale(1/8)
    489 	mulps	xmm0,xmm6		; descale(1/8)
    490 
    491 	movhlps   xmm7,xmm3
    492 	movhlps   xmm2,xmm1
    493 	cvtps2pi  mm2,xmm3		; round to int32, mm2=data2L=(02 12)
    494 	cvtps2pi  mm3,xmm1		; round to int32, mm3=data4L=(04 14)
    495 	cvtps2pi  mm6,xmm7		; round to int32, mm6=data2H=(22 32)
    496 	cvtps2pi  mm7,xmm2		; round to int32, mm7=data4H=(24 34)
    497 	packssdw  mm2,mm6		; mm2=data2=(02 12 22 32)
    498 	packssdw  mm3,mm7		; mm3=data4=(04 14 24 34)
    499 
    500 	movhlps   xmm4,xmm5
    501 	movhlps   xmm6,xmm0
    502 	cvtps2pi  mm5,xmm5		; round to int32, mm5=data5L=(05 15)
    503 	cvtps2pi  mm4,xmm0		; round to int32, mm4=data3L=(03 13)
    504 	cvtps2pi  mm6,xmm4		; round to int32, mm6=data5H=(25 35)
    505 	cvtps2pi  mm7,xmm6		; round to int32, mm7=data3H=(23 33)
    506 	packssdw  mm5,mm6		; mm5=data5=(05 15 25 35)
    507 	packssdw  mm4,mm7		; mm4=data3=(03 13 23 33)
    508 
    509 	movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm6=[PB_CENTERJSAMP]
    510 
    511 	packsswb  mm2,mm3		; mm2=(02 12 22 32 04 14 24 34)
    512 	packsswb  mm4,mm5		; mm4=(03 13 23 33 05 15 25 35)
    513 
        	; Add CENTERJSAMPLE to every byte.  paddb wraps modulo 256, which
        	; shifts the signed-saturated values into the output sample range.
    514 	paddb     mm0,mm6
    515 	paddb     mm1,mm6
    516 	paddb     mm2,mm6
    517 	paddb     mm4,mm6
    518 
        	; Byte transpose in three interleave steps (bytes, words, dwords)
        	; so that each MMX register ends up holding one output row.
    519 	movq      mm7,mm0		; transpose coefficients(phase 1)
    520 	punpcklbw mm0,mm1		; mm0=(00 01 10 11 20 21 30 31)
    521 	punpckhbw mm7,mm1		; mm7=(06 07 16 17 26 27 36 37)
    522 	movq      mm3,mm2		; transpose coefficients(phase 1)
    523 	punpcklbw mm2,mm4		; mm2=(02 03 12 13 22 23 32 33)
    524 	punpckhbw mm3,mm4		; mm3=(04 05 14 15 24 25 34 35)
    525 
    526 	movq      mm5,mm0		; transpose coefficients(phase 2)
    527 	punpcklwd mm0,mm2		; mm0=(00 01 02 03 10 11 12 13)
    528 	punpckhwd mm5,mm2		; mm5=(20 21 22 23 30 31 32 33)
    529 	movq      mm6,mm3		; transpose coefficients(phase 2)
    530 	punpcklwd mm3,mm7		; mm3=(04 05 06 07 14 15 16 17)
    531 	punpckhwd mm6,mm7		; mm6=(24 25 26 27 34 35 36 37)
    532 
    533 	movq      mm1,mm0		; transpose coefficients(phase 3)
    534 	punpckldq mm0,mm3		; mm0=(00 01 02 03 04 05 06 07)
    535 	punpckhdq mm1,mm3		; mm1=(10 11 12 13 14 15 16 17)
    536 	movq      mm4,mm5		; transpose coefficients(phase 3)
    537 	punpckldq mm5,mm6		; mm5=(20 21 22 23 24 25 26 27)
    538 	punpckhdq mm4,mm6		; mm4=(30 31 32 33 34 35 36 37)
    539 
        	; ebx is borrowed as a second row-pointer register for the stores.
        	; NOTE(review): pushpic/poppic come from jsimdext.inc; presumably
        	; they expand to push/pop only in PIC builds, where ebx holds the
        	; GOT base - confirm.
    540 	pushpic	ebx			; save GOT address
    541 
        	; Store 8 samples to each of four rows at output_buf[n] + output_col.
    542 	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
    543 	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
    544 	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
    545 	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
    546 	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
    547 	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
    548 	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
    549 	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
    550 
    551 	poppic	ebx			; restore GOT address
    552 
        	; Next four rows: four floats further along each workspace row,
        	; four row pointers further in output_buf.
    553 	add	esi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
    554 	add	edi, byte 4*SIZEOF_JSAMPROW
    555 	dec	ecx				; ctr
    556 	jnz	near .rowloop
    557 
    558 	emms		; empty MMX state
    559 
        	; Epilogue: restore the saved registers, then undo the realignment.
        	; `pop esp` reloads the pre-alignment stack pointer that the
        	; prologue stored at [ebp], which points at the saved caller ebp.
    560 	pop	edi
    561 	pop	esi
    562 ;	pop	edx		; need not be preserved
    563 ;	pop	ecx		; need not be preserved
    564 	pop	ebx
    565 	mov	esp,ebp		; esp <- aligned ebp
    566 	pop	esp		; esp <- original ebp
    567 	pop	ebp
    568 	ret
    569 
    570 ; For some reason, the OS X linker does not honor the request to align the
    571 ; segment unless we do this.
    572 	align	16
    573