Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jiss2flt-64.asm - floating-point IDCT (64-bit SSE & SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman (a] cendio.se> for Cendio AB
      5 ; Copyright 2009 D. R. Commander
      6 ;
      7 ; Based on
      8 ; x86 SIMD extension for IJG JPEG library
      9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     11 ;
     12 ; This file should be assembled with NASM (Netwide Assembler); it
     13 ; can *not* be assembled with Microsoft's MASM or any compatible
     14 ; assembler (including Borland's Turbo Assembler).
     15 ; NASM is available from http://nasm.sourceforge.net/ or
     16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     17 ;
     18 ; This file contains a floating-point implementation of the inverse DCT
     19 ; (Discrete Cosine Transform). The following code is based directly on
     20 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
     21 ;
     22 ; [TAB8]
     23 
     24 %include "jsimdext.inc"
     25 %include "jdct.inc"
     26 
     27 ; --------------------------------------------------------------------------
     28 ; unpcklps2 dst,src : dst = (dst[0] dst[1] src[0] src[1]), i.e. the low qwords of both
     29 %macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
     30 	shufps	%1, %2, 01000100b	; selectors: %1[0], %1[1], %2[0], %2[1]
     31 %endmacro
     32 ; unpckhps2 dst,src : dst = (dst[2] dst[3] src[2] src[3]), i.e. the high qwords of both
     33 %macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
     34 	shufps	%1, %2, 11101110b	; selectors: %1[2], %1[3], %2[2], %2[3]
     35 %endmacro
     36 
     37 ; --------------------------------------------------------------------------
     38 	SECTION	SEG_CONST
     39 ; Constant table for the float IDCT; each entry is one full 16-byte XMM word.
     40 	alignz	16
     41 	global	EXTN(jconst_idct_float_sse2) PRIVATE
     42 
     43 EXTN(jconst_idct_float_sse2):
     44 ; Packed-float multipliers (4 identical lanes each), then the rounding bias and level shift.
     45 PD_1_414:	times 4 dd  1.414213562373095048801689	; sqrt(2)
     46 PD_1_847:	times 4 dd  1.847759065022573512256366	; multiplier producing z5
     47 PD_1_082:	times 4 dd  1.082392200292393968799446	; multiplier applied to z12
     48 PD_M2_613:	times 4 dd -2.613125929752753055713286	; multiplier applied to z10
     49 PD_RNDINT_MAGIC: times 4 dd  100663296.0	; (float)(0x00C00000 << 3); added before taking the low word as roundint(x/8)
     50 PB_CENTERJSAMP:	times 16 db CENTERJSAMPLE	; added to every packed output byte
     51 
     52 	alignz	16
     53 
     54 ; --------------------------------------------------------------------------
     55 	SECTION	SEG_TEXT
     56 	BITS	64
     57 ;
     58 ; Perform dequantization and inverse DCT on one block of coefficients.
     59 ; Pass 1 dequantizes the columns and stores transposed floats in workspace; pass 2 transforms those, rounds with a /8 descale and stores 8-bit samples.
     60 ; GLOBAL(void)
     61 ; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
     62 ;                        JSAMPARRAY output_buf, JDIMENSION output_col)
     63 ;
     64 ; Argument registers (presumably loaded by the collect_args macro from jsimdext.inc):
     65 ; r10 = void * dct_table
     66 ; r11 = JCOEFPTR coef_block
     67 ; r12 = JSAMPARRAY output_buf
     68 ; r13 = JDIMENSION output_col (32-bit value; only r13d is used)
     69 
     70 %define original_rbp	rbp+0			; slot that holds the pre-alignment rsp
     71 %define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
     72 %define WK_NUM		2
     73 %define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
     74 					; FAST_FLOAT workspace[DCTSIZE2]
     75 
     76 	align	16
     77 	global	EXTN(jsimd_idct_float_sse2) PRIVATE
     78 
     79 EXTN(jsimd_idct_float_sse2):
     80 	push	rbp
     81 	mov	rax,rsp				; rax = rsp after the push (points at the saved rbp)
     82 	sub	rsp, byte 4			; make room so the slot below lands under the saved rbp
     83 	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
     84 	mov	[rsp],rax			; [original_rbp] = pre-alignment rsp, restored by "pop rsp"
     85 	mov	rbp,rsp				; rbp = aligned rbp
     86 	lea	rsp, [workspace]		; reserve wk[] and workspace[] below rbp
     87 	collect_args
     88 	push	rbx				; rbx is used as a row pointer in pass 2
     89 
     90 	; ---- Pass 1: process columns from input, store into work array.
     91 
     92 	mov	rdx, r10	; quantptr
     93 	mov	rsi, r11		; inptr
     94 	lea	rdi, [workspace]			; FAST_FLOAT * wsptr
     95 	mov	rcx, DCTSIZE/4				; ctr
     96 .columnloop:
     97 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
     98 	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
     99 	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    100 	jnz	near .columnDCT			; cheap early out before the full OR below
    101 
    102 	movq	xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    103 	movq	xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    104 	movq	xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    105 	movq	xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    106 	movq	xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    107 	movq	xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    108 	movq	xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    109 	por	xmm1,xmm2
    110 	por	xmm3,xmm4
    111 	por	xmm5,xmm6
    112 	por	xmm1,xmm3
    113 	por	xmm5,xmm7
    114 	por	xmm1,xmm5			; xmm1 = OR of blocks 1..7
    115 	packsswb xmm1,xmm1			; saturating pack: a nonzero word stays a nonzero byte
    116 	movd	eax,xmm1			; eax = 4 packed bytes (upper half of rax is zeroed)
    117 	test	rax,rax
    118 	jnz	short .columnDCT
    119 
    120 	; -- AC terms all zero
    121 
    122 	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    123 
    124 	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
    125 	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
    126 	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
    127 
    128 	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    129 
    130 	movaps	xmm1,xmm0
    131 	movaps	xmm2,xmm0
    132 	movaps	xmm3,xmm0
    133 
    134 	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
    135 	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
    136 	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
    137 	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
    138 
    139 	movaps	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
    140 	movaps	XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
    141 	movaps	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
    142 	movaps	XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
    143 	movaps	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
    144 	movaps	XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
    145 	movaps	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    146 	movaps	XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
    147 	jmp	near .nextcolumn
    148 %endif
    149 .columnDCT:
    150 
    151 	; -- Even part
    152 
    153 	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    154 	movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    155 	movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    156 	movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    157 
    158 	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
    159 	punpcklwd xmm1,xmm1		; xmm1=(20 20 21 21 22 22 23 23)
    160 	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
    161 	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in2=(20 21 22 23)
    162 	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
    163 	cvtdq2ps  xmm1,xmm1			; xmm1=in2=(20 21 22 23)
    164 
    165 	punpcklwd xmm2,xmm2		; xmm2=(40 40 41 41 42 42 43 43)
    166 	punpcklwd xmm3,xmm3		; xmm3=(60 60 61 61 62 62 63 63)
    167 	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in4=(40 41 42 43)
    168 	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in6=(60 61 62 63)
    169 	cvtdq2ps  xmm2,xmm2			; xmm2=in4=(40 41 42 43)
    170 	cvtdq2ps  xmm3,xmm3			; xmm3=in6=(60 61 62 63)
    171 
    172 	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    173 	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    174 	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    175 	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    176 
    177 	movaps	xmm4,xmm0
    178 	movaps	xmm5,xmm1
    179 	subps	xmm0,xmm2		; xmm0=tmp11
    180 	subps	xmm1,xmm3
    181 	addps	xmm4,xmm2		; xmm4=tmp10
    182 	addps	xmm5,xmm3		; xmm5=tmp13
    183 
    184 	mulps	xmm1,[rel PD_1_414]
    185 	subps	xmm1,xmm5		; xmm1=tmp12
    186 
    187 	movaps	xmm6,xmm4
    188 	movaps	xmm7,xmm0
    189 	subps	xmm4,xmm5		; xmm4=tmp3
    190 	subps	xmm0,xmm1		; xmm0=tmp2
    191 	addps	xmm6,xmm5		; xmm6=tmp0
    192 	addps	xmm7,xmm1		; xmm7=tmp1
    193 
    194 	movaps	XMMWORD [wk(1)], xmm4	; tmp3
    195 	movaps	XMMWORD [wk(0)], xmm0	; tmp2
    196 
    197 	; -- Odd part
    198 
    199 	movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    200 	movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    201 	movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    202 	movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    203 
    204 	punpcklwd xmm2,xmm2		; xmm2=(10 10 11 11 12 12 13 13)
    205 	punpcklwd xmm3,xmm3		; xmm3=(30 30 31 31 32 32 33 33)
    206 	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in1=(10 11 12 13)
    207 	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in3=(30 31 32 33)
    208 	cvtdq2ps  xmm2,xmm2			; xmm2=in1=(10 11 12 13)
    209 	cvtdq2ps  xmm3,xmm3			; xmm3=in3=(30 31 32 33)
    210 
    211 	punpcklwd xmm5,xmm5		; xmm5=(50 50 51 51 52 52 53 53)
    212 	punpcklwd xmm1,xmm1		; xmm1=(70 70 71 71 72 72 73 73)
    213 	psrad     xmm5,(DWORD_BIT-WORD_BIT)	; xmm5=in5=(50 51 52 53)
    214 	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in7=(70 71 72 73)
    215 	cvtdq2ps  xmm5,xmm5			; xmm5=in5=(50 51 52 53)
    216 	cvtdq2ps  xmm1,xmm1			; xmm1=in7=(70 71 72 73)
    217 
    218 	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    219 	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    220 	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    221 	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    222 
    223 	movaps	xmm4,xmm2
    224 	movaps	xmm0,xmm5
    225 	addps	xmm2,xmm1		; xmm2=z11
    226 	addps	xmm5,xmm3		; xmm5=z13
    227 	subps	xmm4,xmm1		; xmm4=z12
    228 	subps	xmm0,xmm3		; xmm0=z10
    229 
    230 	movaps	xmm1,xmm2
    231 	subps	xmm2,xmm5
    232 	addps	xmm1,xmm5		; xmm1=tmp7
    233 
    234 	mulps	xmm2,[rel PD_1_414]	; xmm2=tmp11
    235 
    236 	movaps	xmm3,xmm0
    237 	addps	xmm0,xmm4
    238 	mulps	xmm0,[rel PD_1_847]	; xmm0=z5
    239 	mulps	xmm3,[rel PD_M2_613]	; xmm3=(z10 * -2.613125930)
    240 	mulps	xmm4,[rel PD_1_082]	; xmm4=(z12 * 1.082392200)
    241 	addps	xmm3,xmm0		; xmm3=tmp12
    242 	subps	xmm4,xmm0		; xmm4=tmp10
    243 
    244 	; -- Final output stage
    245 
    246 	subps	xmm3,xmm1		; xmm3=tmp6
    247 	movaps	xmm5,xmm6
    248 	movaps	xmm0,xmm7
    249 	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
    250 	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
    251 	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
    252 	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
    253 	subps	xmm2,xmm3		; xmm2=tmp5
    254 
    255 	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
    256 	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
    257 	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
    258 	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
    259 	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
    260 	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
    261 
    262 	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
    263 	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
    264 
    265 	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
    266 	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
    267 
    268 	addps	xmm4,xmm2		; xmm4=tmp4
    269 	movaps	xmm0,xmm7
    270 	movaps	xmm3,xmm5
    271 	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
    272 	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
    273 	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
    274 	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
    275 
    276 	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
    277 	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
    278 	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
    279 	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
    280 	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
    281 	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
    282 
    283 	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
    284 	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
    285 	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
    286 	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
    287 	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
    288 	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
    289 
    290 	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
    291 	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
    292 
    293 	movaps	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
    294 	movaps	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    295 	movaps	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
    296 	movaps	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
    297 
    298 	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
    299 	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
    300 	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
    301 	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
    302 	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
    303 	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
    304 
    305 	movaps	XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
    306 	movaps	XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
    307 	movaps	XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
    308 	movaps	XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
    309 
    310 .nextcolumn:
    311 	add	rsi, byte 4*SIZEOF_JCOEF		; coef_block
    312 	add	rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
    313 	add	rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
    314 	dec	rcx					; ctr
    315 	jnz	near .columnloop
    316 
    317 	; -- Prefetch the next coefficient block
    318 
    319 	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
    320 	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
    321 	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
    322 	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
    323 
    324 	; ---- Pass 2: process rows from work array, store into output array.
    325 
    326 	lea	rsi, [workspace]			; FAST_FLOAT * wsptr
    327 	mov	rdi, r12	; (JSAMPROW *)
    328 	; output_col is a 32-bit JDIMENSION; the upper half of its argument register is undefined, so zero-extend it.
    329 	mov	eax, r13d	; rax = output_col (writing eax clears bits 63:32)
    330 	mov	rcx, DCTSIZE/4				; ctr
    331 .rowloop:
    332 
    333 	; -- Even part
    334 
    335 	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
    336 	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
    337 	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
    338 	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
    339 
    340 	movaps	xmm4,xmm0
    341 	movaps	xmm5,xmm1
    342 	subps	xmm0,xmm2		; xmm0=tmp11
    343 	subps	xmm1,xmm3
    344 	addps	xmm4,xmm2		; xmm4=tmp10
    345 	addps	xmm5,xmm3		; xmm5=tmp13
    346 
    347 	mulps	xmm1,[rel PD_1_414]
    348 	subps	xmm1,xmm5		; xmm1=tmp12
    349 
    350 	movaps	xmm6,xmm4
    351 	movaps	xmm7,xmm0
    352 	subps	xmm4,xmm5		; xmm4=tmp3
    353 	subps	xmm0,xmm1		; xmm0=tmp2
    354 	addps	xmm6,xmm5		; xmm6=tmp0
    355 	addps	xmm7,xmm1		; xmm7=tmp1
    356 
    357 	movaps	XMMWORD [wk(1)], xmm4	; tmp3
    358 	movaps	XMMWORD [wk(0)], xmm0	; tmp2
    359 
    360 	; -- Odd part
    361 
    362 	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
    363 	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
    364 	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
    365 	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
    366 
    367 	movaps	xmm4,xmm2
    368 	movaps	xmm0,xmm5
    369 	addps	xmm2,xmm1		; xmm2=z11
    370 	addps	xmm5,xmm3		; xmm5=z13
    371 	subps	xmm4,xmm1		; xmm4=z12
    372 	subps	xmm0,xmm3		; xmm0=z10
    373 
    374 	movaps	xmm1,xmm2
    375 	subps	xmm2,xmm5
    376 	addps	xmm1,xmm5		; xmm1=tmp7
    377 
    378 	mulps	xmm2,[rel PD_1_414]	; xmm2=tmp11
    379 
    380 	movaps	xmm3,xmm0
    381 	addps	xmm0,xmm4
    382 	mulps	xmm0,[rel PD_1_847]	; xmm0=z5
    383 	mulps	xmm3,[rel PD_M2_613]	; xmm3=(z10 * -2.613125930)
    384 	mulps	xmm4,[rel PD_1_082]	; xmm4=(z12 * 1.082392200)
    385 	addps	xmm3,xmm0		; xmm3=tmp12
    386 	subps	xmm4,xmm0		; xmm4=tmp10
    387 
    388 	; -- Final output stage
    389 
    390 	subps	xmm3,xmm1		; xmm3=tmp6
    391 	movaps	xmm5,xmm6
    392 	movaps	xmm0,xmm7
    393 	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
    394 	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
    395 	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
    396 	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
    397 	subps	xmm2,xmm3		; xmm2=tmp5
    398 
    399 	movaps	xmm1,[rel PD_RNDINT_MAGIC]	; xmm1=[rel PD_RNDINT_MAGIC]
    400 	pcmpeqd	xmm3,xmm3
    401 	psrld	xmm3,WORD_BIT		; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
    402 
    403 	addps	xmm6,xmm1	; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
    404 	addps	xmm7,xmm1	; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
    405 	addps	xmm0,xmm1	; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
    406 	addps	xmm5,xmm1	; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
    407 
    408 	pand	xmm6,xmm3		; xmm6=(00 -- 10 -- 20 -- 30 --)
    409 	pslld	xmm7,WORD_BIT		; xmm7=(-- 01 -- 11 -- 21 -- 31)
    410 	pand	xmm0,xmm3		; xmm0=(06 -- 16 -- 26 -- 36 --)
    411 	pslld	xmm5,WORD_BIT		; xmm5=(-- 07 -- 17 -- 27 -- 37)
    412 	por	xmm6,xmm7		; xmm6=(00 01 10 11 20 21 30 31)
    413 	por	xmm0,xmm5		; xmm0=(06 07 16 17 26 27 36 37)
    414 
    415 	movaps	xmm1, XMMWORD [wk(0)]	; xmm1=tmp2
    416 	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=tmp3
    417 
    418 	addps	xmm4,xmm2		; xmm4=tmp4
    419 	movaps	xmm7,xmm1
    420 	movaps	xmm5,xmm3
    421 	addps	xmm1,xmm2		; xmm1=data2=(02 12 22 32)
    422 	addps	xmm3,xmm4		; xmm3=data4=(04 14 24 34)
    423 	subps	xmm7,xmm2		; xmm7=data5=(05 15 25 35)
    424 	subps	xmm5,xmm4		; xmm5=data3=(03 13 23 33)
    425 
    426 	movaps	xmm2,[rel PD_RNDINT_MAGIC]	; xmm2=[rel PD_RNDINT_MAGIC]
    427 	pcmpeqd	xmm4,xmm4
    428 	psrld	xmm4,WORD_BIT		; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
    429 
    430 	addps	xmm3,xmm2	; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
    431 	addps	xmm7,xmm2	; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
    432 	addps	xmm1,xmm2	; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
    433 	addps	xmm5,xmm2	; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
    434 
    435 	pand	xmm3,xmm4		; xmm3=(04 -- 14 -- 24 -- 34 --)
    436 	pslld	xmm7,WORD_BIT		; xmm7=(-- 05 -- 15 -- 25 -- 35)
    437 	pand	xmm1,xmm4		; xmm1=(02 -- 12 -- 22 -- 32 --)
    438 	pslld	xmm5,WORD_BIT		; xmm5=(-- 03 -- 13 -- 23 -- 33)
    439 	por	xmm3,xmm7		; xmm3=(04 05 14 15 24 25 34 35)
    440 	por	xmm1,xmm5		; xmm1=(02 03 12 13 22 23 32 33)
    441 
    442 	movdqa    xmm2,[rel PB_CENTERJSAMP]	; xmm2=[rel PB_CENTERJSAMP]
    443 
    444 	packsswb  xmm6,xmm3	; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
    445 	packsswb  xmm1,xmm0	; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
    446 	paddb     xmm6,xmm2
    447 	paddb     xmm1,xmm2
    448 
    449 	movdqa    xmm4,xmm6	; transpose coefficients(phase 2)
    450 	punpcklwd xmm6,xmm1	; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    451 	punpckhwd xmm4,xmm1	; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
    452 
    453 	movdqa    xmm7,xmm6	; transpose coefficients(phase 3)
    454 	punpckldq xmm6,xmm4	; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    455 	punpckhdq xmm7,xmm4	; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
    456 
    457 	pshufd	xmm5,xmm6,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    458 	pshufd	xmm3,xmm7,0x4E	; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
    459 
    460 	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
    461 	mov	rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
    462 	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6	; store 8 samples at output_col of row 0
    463 	movq	XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7	; ... row 2
    464 	mov	rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
    465 	mov	rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
    466 	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5	; ... row 1
    467 	movq	XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3	; ... row 3
    468 
    469 	add	rsi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
    470 	add	rdi, byte 4*SIZEOF_JSAMPROW
    471 	dec	rcx				; ctr
    472 	jnz	near .rowloop
    473 
    474 	pop	rbx
    475 	uncollect_args
    476 	mov	rsp,rbp		; rsp <- aligned rbp
    477 	pop	rsp		; rsp <- pre-alignment rsp saved in the prologue
    478 	pop	rbp		; restore the caller's rbp
    479 	ret
    480 
    481 ; For some reason, the OS X linker does not honor the request to align the
    482 ; segment unless we do this.
    483 	align	16
    484