Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jiss2flt.asm - floating-point IDCT (SSE & SSE2)
      3 ;
       4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ;
      6 ; Based on
      7 ; x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
      11 ; This file should be assembled with NASM (Netwide Assembler); it
      12 ; can *not* be assembled with Microsoft's MASM or any compatible
      13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; This file contains a floating-point implementation of the inverse DCT
     18 ; (Discrete Cosine Transform). The following code is based directly on
      19 ; the IJG's original jidctflt.c; see jidctflt.c for more details.
     20 ;
     21 ; [TAB8]
     22 
     23 %include "jsimdext.inc"
     24 %include "jdct.inc"
     25 
     26 ; --------------------------------------------------------------------------
     27 
      28 %macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
         	; Join the low 64-bit halves of two registers.  The shufps immediate
         	; 0x44 selects elements 0,1 of %1 for the low half of the result and
         	; elements 0,1 of %2 for the high half.  Only %1 is written.
      29 	shufps	%1,%2,0x44
      30 %endmacro
     31 
      32 %macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
         	; Join the high 64-bit halves of two registers.  The shufps immediate
         	; 0xEE selects elements 2,3 of %1 for the low half of the result and
         	; elements 2,3 of %2 for the high half.  Only %1 is written.
      33 	shufps	%1,%2,0xEE
      34 %endmacro
     35 
     36 ; --------------------------------------------------------------------------
      37 	SECTION	SEG_CONST
         	;
         	; Constant table used by jsimd_idct_float_sse2.  Every entry is
         	; exactly 16 bytes (4 dwords or 16 bytes), so each label keeps the
         	; alignment requested by the "alignz 16" below; the code reads them
         	; as packed memory operands (mulps/movaps/movdqa), which need that
         	; alignment.  NOTE(review): alignz is a macro from the include file,
         	; presumably "align with zero fill" - confirm in jsimdext.inc.
         	;
      38 
      39 	alignz	16
      40 	global	EXTN(jconst_idct_float_sse2) PRIVATE
      41 
      42 EXTN(jconst_idct_float_sse2):
      43 
         	; Packed single-precision multipliers, one value replicated in all
         	; four lanes.  Numerically: 1.414... = sqrt(2), 1.847... = 2*cos(pi/8),
         	; 1.082... = 2*(cos(pi/8)-cos(3*pi/8)), -2.613... = -2*(cos(pi/8)+cos(3*pi/8)).
         	; In the code: PD_1_414 scales the even-part and odd-part differences,
         	; PD_1_847 scales (z10+z12) to give z5, PD_M2_613 scales z10 and
         	; PD_1_082 scales z12.
      44 PD_1_414	times 4 dd  1.414213562373095048801689
      45 PD_1_847	times 4 dd  1.847759065022573512256366
      46 PD_1_082	times 4 dd  1.082392200292393968799446
      47 PD_M2_613	times 4 dd -2.613125929752753055713286
         	; Rounding constant, equal to 1.5 * 2^26.  Single-precision values of
         	; that magnitude are spaced 8 apart, so adding it to x rounds the sum
         	; to a multiple of 8 and leaves roundint(x/8) in the low mantissa
         	; bits; pass 2 then keeps only the low 16 bits of each dword.
      48 PD_RNDINT_MAGIC	times 4 dd  100663296.0	; (float)(0x00C00000 << 3)
         	; Byte-wise bias added to the packed signed results in pass 2.
         	; CENTERJSAMPLE is defined outside this file (presumably 128 - confirm).
      49 PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
      50 
      51 	alignz	16
     52 
     53 ; --------------------------------------------------------------------------
      54 	SECTION	SEG_TEXT
      55 	BITS	32
      56 ;
      57 ; Perform dequantization and inverse DCT on one block of coefficients.
      58 ;
      59 ; GLOBAL(void)
      60 ; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
      61 ;                        JSAMPARRAY output_buf, JDIMENSION output_col)
      62 ;
         ; Arguments are read from the stack (32-bit code, see the offsets below):
         ;   dct_table   - table of multipliers applied to the coefficients with
         ;                 mulps memory operands (so it must be 16-byte aligned)
         ;   coef_block  - input coefficients, read four at a time with movq and
         ;                 sign-extended from 16-bit words
         ;   output_buf  - array of row pointers; one 8-byte group of samples is
         ;                 stored through each of 8 consecutive entries
         ;   output_col  - offset (in samples) added to every row pointer
         ; Returns nothing.  ebx, esi, edi and ebp are saved and restored;
         ; eax, ecx, edx and xmm0-xmm7 are overwritten.
         ;
      63 
         ; Stack-argument offsets relative to base (b), which points at the saved
         ; ebp: (b)+0 = saved ebp, (b)+4 = return address, (b)+8.. = arguments.
      64 %define dct_table(b)	(b)+8			; void * dct_table
      65 %define coef_block(b)	(b)+12		; JCOEFPTR coef_block
      66 %define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
      67 %define output_col(b)	(b)+20		; JDIMENSION output_col
      68 
         ; Local frame, addressed from the 16-byte-aligned ebp set up in the
         ; prologue:
         ;   [ebp+0]   slot holding the unaligned frame base saved by the prologue
         ;   wk(i)     WK_NUM xmmword scratch slots directly below ebp
         ;             (wk(WK_NUM-1) is the one nearest ebp)
         ;   workspace DCTSIZE2 FAST_FLOATs directly below wk(0)
         ; wk() refers to WK_NUM before its %define; that is fine because NASM
         ; expands single-line macros at the point of use, not of definition.
      69 %define original_ebp	ebp+0
      70 %define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
      71 %define WK_NUM		2
      72 %define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
      73 					; FAST_FLOAT workspace[DCTSIZE2]
     74 
      75 	align	16
      76 	global	EXTN(jsimd_idct_float_sse2) PRIVATE
      77 
      78 EXTN(jsimd_idct_float_sse2):
         	; Register roles after the prologue:
         	;   ebp = 16-byte-aligned frame base; [ebp] = unaligned frame base
         	;         (esp right after "push ebp") used to reach the arguments
         	;   ebx = GOT address used by GOTOFF() to reach the constants
         	;   pass 1: edx = quantptr, esi = inptr, edi = wsptr, ecx = ctr
         	;   pass 2: esi = wsptr, edi = output row-pointer array,
         	;           eax = output_col, ecx = ctr
      79 	push	ebp
      80 	mov	eax,esp				; eax = original ebp
         	; Reserve one dword, then round esp down so that the slot holding
         	; the unaligned base and everything below it is 16-byte aligned.
      81 	sub	esp, byte 4
      82 	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
      83 	mov	[esp],eax
      84 	mov	ebp,esp				; ebp = aligned ebp
         	; Allocate wk[] and workspace[] below the aligned base.
      85 	lea	esp, [workspace]
      86 	push	ebx
      87 ;	push	ecx		; need not be preserved
      88 ;	push	edx		; need not be preserved
      89 	push	esi
      90 	push	edi
      91 
      92 	get_GOT	ebx		; get GOT address
      93 
      94 	; ---- Pass 1: process columns from input, store into work array.
      95 
         	; eax still holds the unaligned frame base from the prologue, so the
         	; reload below is left commented out.
         	; NOTE(review): this relies on get_GOT not modifying eax - confirm
         	; in jsimdext.inc.
      96 ;	mov	eax, [original_ebp]
      97 	mov	edx, POINTER [dct_table(eax)]	; quantptr
      98 	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
      99 	lea	edi, [workspace]			; FAST_FLOAT * wsptr
     100 	mov	ecx, DCTSIZE/4				; ctr
     101 	alignx	16,7
         	; Each iteration handles four adjacent columns: every movq below
         	; loads four 16-bit coefficients of one row.
     102 .columnloop:
     103 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
         	; Cheap early-out: if one dword taken from rows 1 and 2 has any bit
         	; set, an AC term is non-zero and the full test is skipped.
         	; (DWBLOCK/MMBLOCK/XMMBLOCK come from an include file; presumably
         	; they address (row, unit-index) within the 8x8 block.)
     104 	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
     105 	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     106 	jnz	near .columnDCT
     107 
         	; Full test: OR together the four coefficients of rows 1..7.
     108 	movq	xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
     109 	movq	xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
     110 	movq	xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
     111 	movq	xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
     112 	movq	xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
     113 	movq	xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
     114 	movq	xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
     115 	por	xmm1,xmm2
     116 	por	xmm3,xmm4
     117 	por	xmm5,xmm6
     118 	por	xmm1,xmm3
     119 	por	xmm5,xmm7
     120 	por	xmm1,xmm5
         	; The saturating pack squeezes the four words into one dword while
         	; keeping every non-zero word non-zero, so one 32-bit test suffices.
     121 	packsswb xmm1,xmm1
     122 	movd	eax,xmm1
     123 	test	eax,eax
     124 	jnz	short .columnDCT
     125 
     126 	; -- AC terms all zero
     127 
         	; Shortcut: dequantize only row 0 and replicate each column's value
         	; into both xmmwords of that column's workspace row.
     128 	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
     129 
     130 	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
     131 	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
     132 	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
     133 
     134 	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
     135 
     136 	movaps	xmm1,xmm0
     137 	movaps	xmm2,xmm0
     138 	movaps	xmm3,xmm0
     139 
     140 	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
     141 	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
     142 	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
     143 	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
     144 
     145 	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
     146 	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
     147 	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
     148 	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
     149 	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
     150 	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
     151 	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
     152 	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
     153 	jmp	near .nextcolumn
     154 	alignx	16,7
     155 %endif
     156 .columnDCT:
     157 
     158 	; -- Even part
     159 
         	; Load rows 0,2,4,6.  "punpcklwd x,x" copies each 16-bit word into
         	; both halves of a dword and the arithmetic right shift then leaves
         	; the sign-extended 32-bit value, which cvtdq2ps converts to float
         	; before it is multiplied by the matching dct_table entries.
     160 	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
     161 	movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
     162 	movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
     163 	movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
     164 
     165 	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
     166 	punpcklwd xmm1,xmm1		; xmm1=(20 20 21 21 22 22 23 23)
     167 	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
     168 	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in2=(20 21 22 23)
     169 	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
     170 	cvtdq2ps  xmm1,xmm1			; xmm1=in2=(20 21 22 23)
     171 
     172 	punpcklwd xmm2,xmm2		; xmm2=(40 40 41 41 42 42 43 43)
     173 	punpcklwd xmm3,xmm3		; xmm3=(60 60 61 61 62 62 63 63)
     174 	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in4=(40 41 42 43)
     175 	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in6=(60 61 62 63)
     176 	cvtdq2ps  xmm2,xmm2			; xmm2=in4=(40 41 42 43)
     177 	cvtdq2ps  xmm3,xmm3			; xmm3=in6=(60 61 62 63)
     178 
     179 	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
     180 	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
     181 	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
     182 	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
     183 
     184 	movaps	xmm4,xmm0
     185 	movaps	xmm5,xmm1
     186 	subps	xmm0,xmm2		; xmm0=tmp11
     187 	subps	xmm1,xmm3
     188 	addps	xmm4,xmm2		; xmm4=tmp10
     189 	addps	xmm5,xmm3		; xmm5=tmp13
     190 
     191 	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
     192 	subps	xmm1,xmm5		; xmm1=tmp12
     193 
     194 	movaps	xmm6,xmm4
     195 	movaps	xmm7,xmm0
     196 	subps	xmm4,xmm5		; xmm4=tmp3
     197 	subps	xmm0,xmm1		; xmm0=tmp2
     198 	addps	xmm6,xmm5		; xmm6=tmp0
     199 	addps	xmm7,xmm1		; xmm7=tmp1
     200 
         	; All eight xmm registers are needed for the odd part, so tmp2 and
         	; tmp3 are spilled to the wk[] scratch slots.
     201 	movaps	XMMWORD [wk(1)], xmm4	; tmp3
     202 	movaps	XMMWORD [wk(0)], xmm0	; tmp2
     203 
     204 	; -- Odd part
     205 
     206 	movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
     207 	movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
     208 	movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
     209 	movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
     210 
     211 	punpcklwd xmm2,xmm2		; xmm2=(10 10 11 11 12 12 13 13)
     212 	punpcklwd xmm3,xmm3		; xmm3=(30 30 31 31 32 32 33 33)
     213 	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in1=(10 11 12 13)
     214 	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in3=(30 31 32 33)
     215 	cvtdq2ps  xmm2,xmm2			; xmm2=in1=(10 11 12 13)
     216 	cvtdq2ps  xmm3,xmm3			; xmm3=in3=(30 31 32 33)
     217 
     218 	punpcklwd xmm5,xmm5		; xmm5=(50 50 51 51 52 52 53 53)
     219 	punpcklwd xmm1,xmm1		; xmm1=(70 70 71 71 72 72 73 73)
     220 	psrad     xmm5,(DWORD_BIT-WORD_BIT)	; xmm5=in5=(50 51 52 53)
     221 	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in7=(70 71 72 73)
     222 	cvtdq2ps  xmm5,xmm5			; xmm5=in5=(50 51 52 53)
     223 	cvtdq2ps  xmm1,xmm1			; xmm1=in7=(70 71 72 73)
     224 
     225 	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
     226 	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
     227 	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
     228 	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
     229 
     230 	movaps	xmm4,xmm2
     231 	movaps	xmm0,xmm5
     232 	addps	xmm2,xmm1		; xmm2=z11
     233 	addps	xmm5,xmm3		; xmm5=z13
     234 	subps	xmm4,xmm1		; xmm4=z12
     235 	subps	xmm0,xmm3		; xmm0=z10
     236 
     237 	movaps	xmm1,xmm2
     238 	subps	xmm2,xmm5
     239 	addps	xmm1,xmm5		; xmm1=tmp7
     240 
     241 	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
     242 
     243 	movaps	xmm3,xmm0
     244 	addps	xmm0,xmm4
     245 	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
     246 	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
     247 	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
     248 	addps	xmm3,xmm0		; xmm3=tmp12
     249 	subps	xmm4,xmm0		; xmm4=tmp10
     250 
     251 	; -- Final output stage
     252 
         	; The eight result vectors are combined and transposed in two
         	; phases (unpcklps/unpckhps, then the 64-bit unpck*ps2 macros), so
         	; each input column ends up as one workspace row.  wk(0)/wk(1) are
         	; reused to park two half-transposed vectors once tmp2/tmp3 have
         	; been reloaded.
     253 	subps	xmm3,xmm1		; xmm3=tmp6
     254 	movaps	xmm5,xmm6
     255 	movaps	xmm0,xmm7
     256 	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
     257 	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
     258 	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
     259 	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
     260 	subps	xmm2,xmm3		; xmm2=tmp5
     261 
     262 	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
     263 	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
     264 	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
     265 	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
     266 	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
     267 	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
     268 
     269 	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
     270 	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
     271 
     272 	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
     273 	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
     274 
     275 	addps	xmm4,xmm2		; xmm4=tmp4
     276 	movaps	xmm0,xmm7
     277 	movaps	xmm3,xmm5
     278 	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
     279 	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
     280 	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
     281 	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
     282 
     283 	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
     284 	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
     285 	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
     286 	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
     287 	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
     288 	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
     289 
     290 	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
     291 	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
     292 	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
     293 	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
     294 	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
     295 	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
     296 
     297 	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
     298 	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
     299 
     300 	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
     301 	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
     302 	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
     303 	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
     304 
     305 	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
     306 	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
     307 	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
     308 	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
     309 	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
     310 	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
     311 
     312 	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
     313 	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
     314 	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
     315 	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
     316 
     317 .nextcolumn:
         	; Advance to the next four columns of the input and the multiplier
         	; table, and to the next four rows of the workspace.
     318 	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
     319 	add	edx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
     320 	add	edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
     321 	dec	ecx					; ctr
     322 	jnz	near .columnloop
     323 
     324 	; -- Prefetch the next coefficient block
     325 
         	; esi has advanced by 4 coefficients per loop iteration.  With
         	; DCTSIZE = 8 (implied by the hard-coded row indices 0..7) that is
         	; 8 coefficients, so the base address below is the first byte after
         	; this block.  Four prefetches at a 32-byte stride cover 128 bytes.
     326 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
     327 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
     328 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
     329 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
     330 
     331 	; ---- Pass 2: process rows from work array, store into output array.
     332 
         	; eax is reloaded with the unaligned frame base to fetch the last
         	; two arguments, then reused to hold output_col for the whole pass.
     333 	mov	eax, [original_ebp]
     334 	lea	esi, [workspace]			; FAST_FLOAT * wsptr
     335 	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
     336 	mov	eax, JDIMENSION [output_col(eax)]
     337 	mov	ecx, DCTSIZE/4				; ctr
     338 	alignx	16,7
         	; Each iteration turns four floats from each of the eight workspace
         	; rows into four output rows of eight byte samples.
     339 .rowloop:
     340 
     341 	; -- Even part
     342 
     343 	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
     344 	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
     345 	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
     346 	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
     347 
     348 	movaps	xmm4,xmm0
     349 	movaps	xmm5,xmm1
     350 	subps	xmm0,xmm2		; xmm0=tmp11
     351 	subps	xmm1,xmm3
     352 	addps	xmm4,xmm2		; xmm4=tmp10
     353 	addps	xmm5,xmm3		; xmm5=tmp13
     354 
     355 	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
     356 	subps	xmm1,xmm5		; xmm1=tmp12
     357 
     358 	movaps	xmm6,xmm4
     359 	movaps	xmm7,xmm0
     360 	subps	xmm4,xmm5		; xmm4=tmp3
     361 	subps	xmm0,xmm1		; xmm0=tmp2
     362 	addps	xmm6,xmm5		; xmm6=tmp0
     363 	addps	xmm7,xmm1		; xmm7=tmp1
     364 
     365 	movaps	XMMWORD [wk(1)], xmm4	; tmp3
     366 	movaps	XMMWORD [wk(0)], xmm0	; tmp2
     367 
     368 	; -- Odd part
     369 
     370 	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
     371 	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
     372 	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
     373 	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
     374 
     375 	movaps	xmm4,xmm2
     376 	movaps	xmm0,xmm5
     377 	addps	xmm2,xmm1		; xmm2=z11
     378 	addps	xmm5,xmm3		; xmm5=z13
     379 	subps	xmm4,xmm1		; xmm4=z12
     380 	subps	xmm0,xmm3		; xmm0=z10
     381 
     382 	movaps	xmm1,xmm2
     383 	subps	xmm2,xmm5
     384 	addps	xmm1,xmm5		; xmm1=tmp7
     385 
     386 	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
     387 
     388 	movaps	xmm3,xmm0
     389 	addps	xmm0,xmm4
     390 	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
     391 	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
     392 	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
     393 	addps	xmm3,xmm0		; xmm3=tmp12
     394 	subps	xmm4,xmm0		; xmm4=tmp10
     395 
     396 	; -- Final output stage
     397 
     398 	subps	xmm3,xmm1		; xmm3=tmp6
     399 	movaps	xmm5,xmm6
     400 	movaps	xmm0,xmm7
     401 	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
     402 	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
     403 	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
     404 	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
     405 	subps	xmm2,xmm3		; xmm2=tmp5
     406 
         	; Float-to-integer conversion with descale by 8, done without a
         	; convert instruction: adding PD_RNDINT_MAGIC (1.5 * 2^26, where
         	; floats are spaced 8 apart) leaves roundint(x/8) in the low bits of
         	; each dword.  The 0x0000FFFF mask built below keeps that low word
         	; for one vector, "pslld WORD_BIT" moves it to the high word for
         	; its neighbour, and por interleaves the two.
     407 	movaps	xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; xmm1=[PD_RNDINT_MAGIC]
     408 	pcmpeqd	xmm3,xmm3
     409 	psrld	xmm3,WORD_BIT		; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
     410 
     411 	addps	xmm6,xmm1	; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
     412 	addps	xmm7,xmm1	; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
     413 	addps	xmm0,xmm1	; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
     414 	addps	xmm5,xmm1	; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
     415 
     416 	pand	xmm6,xmm3		; xmm6=(00 -- 10 -- 20 -- 30 --)
     417 	pslld	xmm7,WORD_BIT		; xmm7=(-- 01 -- 11 -- 21 -- 31)
     418 	pand	xmm0,xmm3		; xmm0=(06 -- 16 -- 26 -- 36 --)
     419 	pslld	xmm5,WORD_BIT		; xmm5=(-- 07 -- 17 -- 27 -- 37)
     420 	por	xmm6,xmm7		; xmm6=(00 01 10 11 20 21 30 31)
     421 	por	xmm0,xmm5		; xmm0=(06 07 16 17 26 27 36 37)
     422 
     423 	movaps	xmm1, XMMWORD [wk(0)]	; xmm1=tmp2
     424 	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=tmp3
     425 
     426 	addps	xmm4,xmm2		; xmm4=tmp4
     427 	movaps	xmm7,xmm1
     428 	movaps	xmm5,xmm3
     429 	addps	xmm1,xmm2		; xmm1=data2=(02 12 22 32)
     430 	addps	xmm3,xmm4		; xmm3=data4=(04 14 24 34)
     431 	subps	xmm7,xmm2		; xmm7=data5=(05 15 25 35)
     432 	subps	xmm5,xmm4		; xmm5=data3=(03 13 23 33)
     433 
         	; Same rounding/interleave sequence for data2..data5; the magic
         	; constant and mask are rebuilt because their registers were reused.
     434 	movaps	xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; xmm2=[PD_RNDINT_MAGIC]
     435 	pcmpeqd	xmm4,xmm4
     436 	psrld	xmm4,WORD_BIT		; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
     437 
     438 	addps	xmm3,xmm2	; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
     439 	addps	xmm7,xmm2	; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
     440 	addps	xmm1,xmm2	; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
     441 	addps	xmm5,xmm2	; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
     442 
     443 	pand	xmm3,xmm4		; xmm3=(04 -- 14 -- 24 -- 34 --)
     444 	pslld	xmm7,WORD_BIT		; xmm7=(-- 05 -- 15 -- 25 -- 35)
     445 	pand	xmm1,xmm4		; xmm1=(02 -- 12 -- 22 -- 32 --)
     446 	pslld	xmm5,WORD_BIT		; xmm5=(-- 03 -- 13 -- 23 -- 33)
     447 	por	xmm3,xmm7		; xmm3=(04 05 14 15 24 25 34 35)
     448 	por	xmm1,xmm5		; xmm1=(02 03 12 13 22 23 32 33)
     449 
         	; packsswb saturates the signed words to signed bytes; the wrapping
         	; byte add of PB_CENTERJSAMP then applies the CENTERJSAMPLE bias
         	; (presumably 128, giving unsigned samples - value defined elsewhere).
     450 	movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm2=[PB_CENTERJSAMP]
     451 
     452 	packsswb  xmm6,xmm3	; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
     453 	packsswb  xmm1,xmm0	; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
     454 	paddb     xmm6,xmm2
     455 	paddb     xmm1,xmm2
     456 
     457 	movdqa    xmm4,xmm6	; transpose coefficients(phase 2)
     458 	punpcklwd xmm6,xmm1	; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
     459 	punpckhwd xmm4,xmm1	; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
     460 
     461 	movdqa    xmm7,xmm6	; transpose coefficients(phase 3)
     462 	punpckldq xmm6,xmm4	; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
     463 	punpckhdq xmm7,xmm4	; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
     464 
         	; Swap the 64-bit halves so that rows 1 and 3 also sit in the low
         	; quadword, where movq can store them.
     465 	pshufd	xmm5,xmm6,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
     466 	pshufd	xmm3,xmm7,0x4E	; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
     467 
         	; ebx is borrowed as a second row pointer for the stores below, so
         	; its GOT value is parked with pushpic and recovered with poppic
         	; (both macros come from the include file).
     468 	pushpic	ebx			; save GOT address
     469 
     470 	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
     471 	mov	ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
     472 	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
     473 	movq	XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
     474 	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
     475 	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
     476 	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
     477 	movq	XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
     478 
     479 	poppic	ebx			; restore GOT address
     480 
     481 	add	esi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
     482 	add	edi, byte 4*SIZEOF_JSAMPROW
     483 	dec	ecx				; ctr
     484 	jnz	near .rowloop
     485 
         	; Epilogue: restore the saved registers, then unwind the aligned
         	; frame.  [ebp] holds the unaligned frame base stored by the
         	; prologue, so "pop esp" reloads it and "pop ebp" restores the
         	; caller's ebp.
     486 	pop	edi
     487 	pop	esi
     488 ;	pop	edx		; need not be preserved
     489 ;	pop	ecx		; need not be preserved
     490 	pop	ebx
     491 	mov	esp,ebp		; esp <- aligned ebp
     492 	pop	esp		; esp <- original ebp
     493 	pop	ebp
     494 	ret
     495 
     496 ; For some reason, the OS X linker does not honor the request to align the
     497 ; segment unless we do this.
     498 	align	16
    499