Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jdsamss2.asm - upsampling (SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ;
      6 ; Based on
      7 ; x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler),
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jsimdext.inc"
     20 
     21 ; --------------------------------------------------------------------------
        ; Word constants used by the two "fancy" (triangle filter) upsamplers below.
        ; Each label is one vector of eight identical 16-bit words (16 bytes), so it
        ; can be used directly as the memory operand of pmullw/paddw.
     22 	SECTION	SEG_CONST
     23 
     24 	alignz	16
     25 	global	EXTN(jconst_fancy_upsample_sse2) PRIVATE
     26 
     27 EXTN(jconst_fancy_upsample_sse2):
     28 
     29 PW_ONE		times 8 dw  1	; h2v1: bias added to even outputs before >>2
     30 PW_TWO		times 8 dw  2	; h2v1: bias added to odd outputs before >>2
     31 PW_THREE	times 8 dw  3	; h2v1/h2v2: weight of the nearer sample
     32 PW_SEVEN	times 8 dw  7	; h2v2: bias added to odd outputs before >>4
     33 PW_EIGHT	times 8 dw  8	; h2v2: bias added to even outputs before >>4
     34 
     35 	alignz	16
     36 
     37 ; --------------------------------------------------------------------------
     38 	SECTION	SEG_TEXT
     39 	BITS	32
     40 ;
     41 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
     42 ;
     43 ; The upsampling algorithm is linear interpolation between pixel centers,
     44 ; also known as a "triangle filter".  This is a good compromise between
     45 ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
     46 ; of the way between input pixel centers.
     47 ;
        ; For every input sample s[i] two output samples are produced:
        ;   out[2i]   = (3*s[i] + s[i-1] + 1) >> 2
        ;   out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2
        ; At both ends of a row the missing neighbour is replaced by the edge
        ; sample itself.
        ;
        ; Rows are processed in blocks of SIZEOF_XMMWORD input samples.  All row
        ; accesses use movdqa (or an SSE2 memory operand), which requires aligned
        ; addresses, and output is always stored in whole blocks of
        ; 2*SIZEOF_XMMWORD bytes, even when the width is not a block multiple.
        ; NOTE(review): when downsampled_width is not a multiple of SIZEOF_XMMWORD
        ; one dummy sample is stored just past the last sample of the *input* row;
        ; this presumably relies on the caller padding its row buffers -- confirm.
        ;
        ; Returns immediately if downsampled_width or max_v_samp_factor is zero.
        ;
        ; Register use inside the row loop:
        ;   eax = columns left (rounded up to a block multiple), ecx = rows left
        ;   esi = inptr, edi = outptr, dl = scratch
        ;   ebx = value loaded by get_GOT, consumed by GOTOFF() to reach the constants
        ;   xmm0 = zero, xmm7 = sample "-1" in byte 0, xmm6 = sample "16" in byte 15
        ; pushpic/poppic/get_GOT/alignx are macros defined outside this file
        ; (jsimdext.inc is included at the top); their expansion is not shown here.
        ;
     48 ; GLOBAL(void)
     49 ; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
     50 ;                                 JDIMENSION downsampled_width,
     51 ;                                 JSAMPARRAY input_data,
     52 ;                                 JSAMPARRAY * output_data_ptr);
     53 ;
     54 
     55 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor
     56 %define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
     57 %define input_data(b)		(b)+16		; JSAMPARRAY input_data
     58 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
     59 
     60 	align	16
     61 	global	EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE
     62 
     63 EXTN(jsimd_h2v1_fancy_upsample_sse2):
     64 	push	ebp
     65 	mov	ebp,esp
     66 	pushpic	ebx
     67 ;	push	ecx		; need not be preserved
     68 ;	push	edx		; need not be preserved
     69 	push	esi
     70 	push	edi
     71 
     72 	get_GOT	ebx		; get GOT address
     73 
     74 	mov	eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
     75 	test	eax,eax
     76 	jz	near .return		; zero width: nothing to do
     77 
     78 	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
     79 	test	ecx,ecx
     80 	jz	near .return		; zero rows: nothing to do
     81 
     82 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
     83 	mov	edi, POINTER [output_data_ptr(ebp)]
     84 	mov	edi, JSAMPARRAY [edi]			; output_data
     85 	alignx	16,7
     86 .rowloop:
     87 	push	eax			; colctr
     88 	push	edi
     89 	push	esi
     90 
     91 	mov	esi, JSAMPROW [esi]	; inptr
     92 	mov	edi, JSAMPROW [edi]	; outptr
     93 
     94 	test	eax, SIZEOF_XMMWORD-1	; is the width a whole number of blocks?
     95 	jz	short .skip		; yes: no padding sample needed
     96 	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]	; dl = last real sample
     97 	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
     98 .skip:
     99 	pxor	xmm0,xmm0		; xmm0=(all 0's)
    100 	pcmpeqb	xmm7,xmm7		; xmm7=(all 1's)
    101 	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; xmm7=mask keeping byte 0 only
    102 	pand	xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]	; xmm7=( 0 -- -- ... -- -- --), used as sample -1
    103 
    104 	add	eax, byte SIZEOF_XMMWORD-1
    105 	and	eax, byte -SIZEOF_XMMWORD	; round colctr up to a whole block
    106 	cmp	eax, byte SIZEOF_XMMWORD
    107 	ja	short .columnloop	; more than one block in this row
    108 	alignx	16,7
    109 
    110 .columnloop_last:
        ; Last block of the row: its own byte 15 stands in for sample 16.
    111 	pcmpeqb	xmm6,xmm6
    112 	pslldq	xmm6,(SIZEOF_XMMWORD-1)	; xmm6=mask keeping byte 15 only
    113 	pand	xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]	; xmm6=(-- -- -- ... -- -- 15)
    114 	jmp	short .upsample
    115 	alignx	16,7
    116 
    117 .columnloop:
        ; Not the last block: take sample 16 from the first byte of the next block.
    118 	movdqa	xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
    119 	pslldq	xmm6,(SIZEOF_XMMWORD-1)	; xmm6=(-- -- -- ... -- -- 16)
    120 
    121 .upsample:
    122 	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    123 	movdqa	xmm2,xmm1
    124 	movdqa	xmm3,xmm1		; xmm1=( 0  1  2 ... 13 14 15)
    125 	pslldq	xmm2,1			; xmm2=(--  0  1 ... 12 13 14)
    126 	psrldq	xmm3,1			; xmm3=( 1  2  3 ... 14 15 --)
    127 
    128 	por	xmm2,xmm7		; xmm2=(-1  0  1 ... 12 13 14)
    129 	por	xmm3,xmm6		; xmm3=( 1  2  3 ... 14 15 16)
    130 
        ; Carry this block's sample 15 forward as sample -1 of the next block.
    131 	movdqa	xmm7,xmm1
    132 	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; xmm7=(15 -- -- ... -- -- --)
    133 
        ; Widen bytes to 16-bit words (xmm0 is zero) so the sums cannot overflow.
    134 	movdqa    xmm4,xmm1
    135 	punpcklbw xmm1,xmm0		; xmm1=( 0  1  2  3  4  5  6  7)
    136 	punpckhbw xmm4,xmm0		; xmm4=( 8  9 10 11 12 13 14 15)
    137 	movdqa    xmm5,xmm2
    138 	punpcklbw xmm2,xmm0		; xmm2=(-1  0  1  2  3  4  5  6)
    139 	punpckhbw xmm5,xmm0		; xmm5=( 7  8  9 10 11 12 13 14)
    140 	movdqa    xmm6,xmm3
    141 	punpcklbw xmm3,xmm0		; xmm3=( 1  2  3  4  5  6  7  8)
    142 	punpckhbw xmm6,xmm0		; xmm6=( 9 10 11 12 13 14 15 16)
    143 
    144 	pmullw	xmm1,[GOTOFF(ebx,PW_THREE)]	; xmm1=3*current (low half)
    145 	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]	; xmm4=3*current (high half)
    146 	paddw	xmm2,[GOTOFF(ebx,PW_ONE)]	; left neighbour + 1
    147 	paddw	xmm5,[GOTOFF(ebx,PW_ONE)]
    148 	paddw	xmm3,[GOTOFF(ebx,PW_TWO)]	; right neighbour + 2
    149 	paddw	xmm6,[GOTOFF(ebx,PW_TWO)]
    150 
    151 	paddw	xmm2,xmm1
    152 	paddw	xmm5,xmm4
    153 	psrlw	xmm2,2			; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
    154 	psrlw	xmm5,2			; xmm5=OutHE=(16 18 20 22 24 26 28 30)
    155 	paddw	xmm3,xmm1
    156 	paddw	xmm6,xmm4
    157 	psrlw	xmm3,2			; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
    158 	psrlw	xmm6,2			; xmm6=OutHO=(17 19 21 23 25 27 29 31)
    159 
        ; Interleave: odd outputs go to the high byte of each word, even to the low.
    160 	psllw	xmm3,BYTE_BIT
    161 	psllw	xmm6,BYTE_BIT
    162 	por	xmm2,xmm3		; xmm2=OutL=( 0  1  2 ... 13 14 15)
    163 	por	xmm5,xmm6		; xmm5=OutH=(16 17 18 ... 29 30 31)
    164 
    165 	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
    166 	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
    167 
    168 	sub	eax, byte SIZEOF_XMMWORD
    169 	add	esi, byte 1*SIZEOF_XMMWORD	; inptr
    170 	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
    171 	cmp	eax, byte SIZEOF_XMMWORD
    172 	ja	near .columnloop	; more than one block still to do
    173 	test	eax,eax
    174 	jnz	near .columnloop_last	; exactly one block still to do
    175 
    176 	pop	esi			; input_data
    177 	pop	edi			; output_data
    178 	pop	eax			; colctr (original, unrounded width)
    179 
    180 	add	esi, byte SIZEOF_JSAMPROW	; input_data
    181 	add	edi, byte SIZEOF_JSAMPROW	; output_data
    182 	dec	ecx				; rowctr
    183 	jg	near .rowloop		; loop while rowctr > 0 (signed compare)
    184 
    185 .return:
    186 	pop	edi
    187 	pop	esi
    188 ;	pop	edx		; need not be preserved
    189 ;	pop	ecx		; need not be preserved
    190 	poppic	ebx
    191 	pop	ebp
    192 	ret
    193 
    194 ; --------------------------------------------------------------------------
    195 ;
    196 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
    197 ; Again a triangle filter; see comments for h2v1 case, above.
    198 ;
        ; Each block of SIZEOF_XMMWORD input columns is handled in two passes:
        ;   vertical:    Int0 = 3*row[0] + row[-1],  Int1 = 3*row[0] + row[+1]
        ;                (16-bit words, parked temporarily in the two output rows)
        ;   horizontal:  out[2i]   = (3*Int[i] + Int[i-1] + 8) >> 4
        ;                out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
        ; At both ends of a row the missing horizontal neighbour is replaced by the
        ; edge value itself.
        ;
        ; Each row-loop iteration consumes one input row pointer and fills two output
        ; rows; rowctr is decreased by 2.  The row pointers at input_data[-1] and
        ; input_data[+1] are dereferenced, so the caller must supply valid rows above
        ; and below the one being processed.
        ; NOTE(review): when downsampled_width is not a multiple of SIZEOF_XMMWORD
        ; one dummy sample is stored just past the end of all three *input* rows;
        ; this presumably relies on padded row buffers -- confirm.
        ;
        ; Stack frame: ebp is realigned to SIZEOF_XMMWORD.  [ebp+0] holds the
        ; pre-alignment stack pointer, which is popped back into esp on exit; the
        ; arguments are read through edx, which carries the same value.
        ;   wk(0)/wk(1) = left-neighbour word  ("-1") for the upper/lower output row
        ;   wk(2)/wk(3) = right-neighbour word ("16") for the upper/lower output row
        ;   gotptr      = saved GOT address (pointer-sized slot just below wk(0))
        ; ebx doubles as inptr0 in the column loop, so the GOT address is reloaded
        ; from gotptr around every use of GOTOFF().  pushpic/movpic/poppic/get_GOT
        ; are macros defined outside this file; their expansion is not shown here.
        ;
    199 ; GLOBAL(void)
    200 ; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
    201 ;                                 JDIMENSION downsampled_width,
    202 ;                                 JSAMPARRAY input_data,
    203 ;                                 JSAMPARRAY * output_data_ptr);
    204 ;
    205 
    206 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor
    207 %define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
    208 %define input_data(b)		(b)+16		; JSAMPARRAY input_data
    209 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
    210 
    211 %define original_ebp	ebp+0
    212 %define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
    213 %define WK_NUM		4
    214 %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
    215 
    216 	align	16
    217 	global	EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE
    218 
    219 EXTN(jsimd_h2v2_fancy_upsample_sse2):
    220 	push	ebp
    221 	mov	eax,esp				; eax = original ebp
    222 	sub	esp, byte 4			; room for the saved stack pointer
    223 	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
    224 	mov	[esp],eax			; [original_ebp] = pre-alignment esp
    225 	mov	ebp,esp				; ebp = aligned ebp
    226 	lea	esp, [wk(0)]			; reserve wk[0..WK_NUM-1]
    227 	pushpic	eax		; make a room for GOT address
    228 	push	ebx
    229 ;	push	ecx		; need not be preserved
    230 ;	push	edx		; need not be preserved
    231 	push	esi
    232 	push	edi
    233 
    234 	get_GOT	ebx			; get GOT address
    235 	movpic	POINTER [gotptr], ebx	; save GOT address
    236 
    237 	mov	edx,eax				; edx = original ebp
    238 	mov	eax, JDIMENSION [downsamp_width(edx)]  ; colctr
    239 	test	eax,eax
    240 	jz	near .return		; zero width: nothing to do
    241 
    242 	mov	ecx, INT [max_v_samp(edx)]	; rowctr
    243 	test	ecx,ecx
    244 	jz	near .return		; zero rows: nothing to do
    245 
    246 	mov	esi, JSAMPARRAY [input_data(edx)]	; input_data
    247 	mov	edi, POINTER [output_data_ptr(edx)]
    248 	mov	edi, JSAMPARRAY [edi]			; output_data
    249 	alignx	16,7
    250 .rowloop:
    251 	push	eax					; colctr
    252 	push	ecx					; rowctr (ecx is reused as inptr1 below)
    253 	push	edi
    254 	push	esi
    255 
    256 	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
    257 	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
    258 	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
    259 	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
    260 	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
    261 
    262 	test	eax, SIZEOF_XMMWORD-1	; is the width a whole number of blocks?
    263 	jz	short .skip		; yes: no padding sample needed
    264 	push	edx			; dl is used as scratch; keep outptr0
    265 	mov	dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
    266 	mov	JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
    267 	mov	dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
    268 	mov	JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
    269 	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    270 	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
    271 	pop	edx
    272 .skip:
    273 	; -- process the first column block
        ; (vertical pass only; the horizontal pass for it runs at .upsample)
    274 
    275 	movdqa	xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]	; xmm0=row[ 0][0]
    276 	movdqa	xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]	; xmm1=row[-1][0]
    277 	movdqa	xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]	; xmm2=row[+1][0]
    278 
    279 	pushpic	ebx			; keep inptr0 while ebx addresses the constants
    280 	movpic	ebx, POINTER [gotptr]	; load GOT address
    281 
    282 	pxor      xmm3,xmm3		; xmm3=(all 0's)
    283 	movdqa    xmm4,xmm0
    284 	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    285 	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    286 	movdqa    xmm5,xmm1
    287 	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    288 	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    289 	movdqa    xmm6,xmm2
    290 	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    291 	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
    292 
    293 	pmullw	xmm0,[GOTOFF(ebx,PW_THREE)]	; xmm0=3*row[0] (low half)
    294 	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]	; xmm4=3*row[0] (high half)
    295 
    296 	pcmpeqb	xmm7,xmm7
    297 	psrldq	xmm7,(SIZEOF_XMMWORD-2)	; xmm7=mask keeping word 0 only
    298 
    299 	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    300 	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    301 	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    302 	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
    303 
    304 	movdqa	XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1	; temporarily save
    305 	movdqa	XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5	; the intermediate data
    306 	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
    307 	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
    308 
    309 	pand	xmm1,xmm7		; xmm1=( 0 -- -- -- -- -- -- --)
    310 	pand	xmm2,xmm7		; xmm2=( 0 -- -- -- -- -- -- --)
    311 
    312 	movdqa	XMMWORD [wk(0)], xmm1	; left edge: word 0 serves as word -1
    313 	movdqa	XMMWORD [wk(1)], xmm2
    314 
    315 	poppic	ebx			; ebx = inptr0 again
    316 
    317 	add	eax, byte SIZEOF_XMMWORD-1
    318 	and	eax, byte -SIZEOF_XMMWORD	; round colctr up to a whole block
    319 	cmp	eax, byte SIZEOF_XMMWORD
    320 	ja	short .columnloop	; more than one block in this row
    321 	alignx	16,7
    322 
    323 .columnloop_last:
    324 	; -- process the last column block
        ; (right edge: word 15 of the saved intermediate data serves as word 16)
    325 
    326 	pushpic	ebx
    327 	movpic	ebx, POINTER [gotptr]	; load GOT address
    328 
    329 	pcmpeqb	xmm1,xmm1
    330 	pslldq	xmm1,(SIZEOF_XMMWORD-2)	; xmm1=mask keeping word 7 only
    331 	movdqa	xmm2,xmm1
    332 
    333 	pand	xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
    334 	pand	xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
    335 
    336 	movdqa	XMMWORD [wk(2)], xmm1	; xmm1=(-- -- -- -- -- -- -- 15)
    337 	movdqa	XMMWORD [wk(3)], xmm2	; xmm2=(-- -- -- -- -- -- -- 15)
    338 
    339 	jmp	near .upsample
    340 	alignx	16,7
    341 
    342 .columnloop:
    343 	; -- process the next column block
        ; (vertical pass one block ahead, so .upsample has its right neighbour)
    344 
    345 	movdqa	xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]	; xmm0=row[ 0][1]
    346 	movdqa	xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]	; xmm1=row[-1][1]
    347 	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]	; xmm2=row[+1][1]
    348 
    349 	pushpic	ebx
    350 	movpic	ebx, POINTER [gotptr]	; load GOT address
    351 
    352 	pxor      xmm3,xmm3		; xmm3=(all 0's)
    353 	movdqa    xmm4,xmm0
    354 	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    355 	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    356 	movdqa    xmm5,xmm1
    357 	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    358 	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    359 	movdqa    xmm6,xmm2
    360 	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    361 	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
    362 
    363 	pmullw	xmm0,[GOTOFF(ebx,PW_THREE)]
    364 	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
    365 
    366 	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    367 	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    368 	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    369 	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
    370 
    371 	movdqa	XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1	; temporarily save
    372 	movdqa	XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5	; the intermediate data
    373 	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
    374 	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
    375 
    376 	pslldq	xmm1,(SIZEOF_XMMWORD-2)	; xmm1=(-- -- -- -- -- -- --  0)
    377 	pslldq	xmm2,(SIZEOF_XMMWORD-2)	; xmm2=(-- -- -- -- -- -- --  0)
    378 
    379 	movdqa	XMMWORD [wk(2)], xmm1	; next block's word 0 = this block's word 16
    380 	movdqa	XMMWORD [wk(3)], xmm2
    381 
    382 .upsample:
    383 	; -- process the upper row
        ; (on entry ebx holds the GOT address and inptr0 is on the stack)
    384 
    385 	movdqa	xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
    386 	movdqa	xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
    387 
    388 	movdqa	xmm0,xmm7		; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
    389 	movdqa	xmm4,xmm3		; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
    390 	psrldq	xmm0,2			; xmm0=( 1  2  3  4  5  6  7 --)
    391 	pslldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(-- -- -- -- -- -- --  8)
    392 	movdqa	xmm5,xmm7
    393 	movdqa	xmm6,xmm3
    394 	psrldq	xmm5,(SIZEOF_XMMWORD-2)	; xmm5=( 7 -- -- -- -- -- -- --)
    395 	pslldq	xmm6,2			; xmm6=(--  8  9 10 11 12 13 14)
    396 
    397 	por	xmm0,xmm4		; xmm0=( 1  2  3  4  5  6  7  8)
    398 	por	xmm5,xmm6		; xmm5=( 7  8  9 10 11 12 13 14)
    399 
    400 	movdqa	xmm1,xmm7
    401 	movdqa	xmm2,xmm3
    402 	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
    403 	psrldq	xmm2,2			; xmm2=( 9 10 11 12 13 14 15 --)
    404 	movdqa	xmm4,xmm3
    405 	psrldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(15 -- -- -- -- -- -- --)
    406 
    407 	por	xmm1, XMMWORD [wk(0)]	; xmm1=(-1  0  1  2  3  4  5  6)
    408 	por	xmm2, XMMWORD [wk(2)]	; xmm2=( 9 10 11 12 13 14 15 16)
    409 
    410 	movdqa	XMMWORD [wk(0)], xmm4	; word 15 becomes word -1 of the next block
    411 
    412 	pmullw	xmm7,[GOTOFF(ebx,PW_THREE)]	; 3*current (low half)
    413 	pmullw	xmm3,[GOTOFF(ebx,PW_THREE)]	; 3*current (high half)
    414 	paddw	xmm1,[GOTOFF(ebx,PW_EIGHT)]	; left neighbour + 8
    415 	paddw	xmm5,[GOTOFF(ebx,PW_EIGHT)]
    416 	paddw	xmm0,[GOTOFF(ebx,PW_SEVEN)]	; right neighbour + 7
    417 	paddw	xmm2,[GOTOFF(ebx,PW_SEVEN)]
    418 
    419 	paddw	xmm1,xmm7
    420 	paddw	xmm5,xmm3
    421 	psrlw	xmm1,4			; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
    422 	psrlw	xmm5,4			; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
    423 	paddw	xmm0,xmm7
    424 	paddw	xmm2,xmm3
    425 	psrlw	xmm0,4			; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
    426 	psrlw	xmm2,4			; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
    427 
        ; Interleave: odd outputs go to the high byte of each word, even to the low.
    428 	psllw	xmm0,BYTE_BIT
    429 	psllw	xmm2,BYTE_BIT
    430 	por	xmm1,xmm0		; xmm1=Out0L=( 0  1  2 ... 13 14 15)
    431 	por	xmm5,xmm2		; xmm5=Out0H=(16 17 18 ... 29 30 31)
    432 
        ; Final samples overwrite the intermediate data parked in the output row.
    433 	movdqa	XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
    434 	movdqa	XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
    435 
    436 	; -- process the lower row
    437 
    438 	movdqa	xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
    439 	movdqa	xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
    440 
    441 	movdqa	xmm7,xmm6		; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
    442 	movdqa	xmm3,xmm4		; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
    443 	psrldq	xmm7,2			; xmm7=( 1  2  3  4  5  6  7 --)
    444 	pslldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(-- -- -- -- -- -- --  8)
    445 	movdqa	xmm0,xmm6
    446 	movdqa	xmm2,xmm4
    447 	psrldq	xmm0,(SIZEOF_XMMWORD-2)	; xmm0=( 7 -- -- -- -- -- -- --)
    448 	pslldq	xmm2,2			; xmm2=(--  8  9 10 11 12 13 14)
    449 
    450 	por	xmm7,xmm3		; xmm7=( 1  2  3  4  5  6  7  8)
    451 	por	xmm0,xmm2		; xmm0=( 7  8  9 10 11 12 13 14)
    452 
    453 	movdqa	xmm1,xmm6
    454 	movdqa	xmm5,xmm4
    455 	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
    456 	psrldq	xmm5,2			; xmm5=( 9 10 11 12 13 14 15 --)
    457 	movdqa	xmm3,xmm4
    458 	psrldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(15 -- -- -- -- -- -- --)
    459 
    460 	por	xmm1, XMMWORD [wk(1)]	; xmm1=(-1  0  1  2  3  4  5  6)
    461 	por	xmm5, XMMWORD [wk(3)]	; xmm5=( 9 10 11 12 13 14 15 16)
    462 
    463 	movdqa	XMMWORD [wk(1)], xmm3	; word 15 becomes word -1 of the next block
    464 
    465 	pmullw	xmm6,[GOTOFF(ebx,PW_THREE)]
    466 	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
    467 	paddw	xmm1,[GOTOFF(ebx,PW_EIGHT)]
    468 	paddw	xmm0,[GOTOFF(ebx,PW_EIGHT)]
    469 	paddw	xmm7,[GOTOFF(ebx,PW_SEVEN)]
    470 	paddw	xmm5,[GOTOFF(ebx,PW_SEVEN)]
    471 
    472 	paddw	xmm1,xmm6
    473 	paddw	xmm0,xmm4
    474 	psrlw	xmm1,4			; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
    475 	psrlw	xmm0,4			; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
    476 	paddw	xmm7,xmm6
    477 	paddw	xmm5,xmm4
    478 	psrlw	xmm7,4			; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
    479 	psrlw	xmm5,4			; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
    480 
    481 	psllw	xmm7,BYTE_BIT
    482 	psllw	xmm5,BYTE_BIT
    483 	por	xmm1,xmm7		; xmm1=Out1L=( 0  1  2 ... 13 14 15)
    484 	por	xmm0,xmm5		; xmm0=Out1H=(16 17 18 ... 29 30 31)
    485 
    486 	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
    487 	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
    488 
    489 	poppic	ebx			; ebx = inptr0 again
    490 
    491 	sub	eax, byte SIZEOF_XMMWORD
    492 	add	ecx, byte 1*SIZEOF_XMMWORD	; inptr1(above)
    493 	add	ebx, byte 1*SIZEOF_XMMWORD	; inptr0
    494 	add	esi, byte 1*SIZEOF_XMMWORD	; inptr1(below)
    495 	add	edx, byte 2*SIZEOF_XMMWORD	; outptr0
    496 	add	edi, byte 2*SIZEOF_XMMWORD	; outptr1
    497 	cmp	eax, byte SIZEOF_XMMWORD
    498 	ja	near .columnloop	; more than one block still to do
    499 	test	eax,eax
    500 	jnz	near .columnloop_last	; exactly one block still to do
    501 
    502 	pop	esi			; input_data
    503 	pop	edi			; output_data
    504 	pop	ecx			; rowctr
    505 	pop	eax			; colctr (original, unrounded width)
    506 
    507 	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
    508 	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
    509 	sub	ecx, byte 2			; rowctr
    510 	jg	near .rowloop		; loop while rowctr > 0 (signed compare)
    511 
    512 .return:
    513 	pop	edi
    514 	pop	esi
    515 ;	pop	edx		; need not be preserved
    516 ;	pop	ecx		; need not be preserved
    517 	pop	ebx
    518 	mov	esp,ebp		; esp <- aligned ebp
    519 	pop	esp		; esp <- original ebp
    520 	pop	ebp
    521 	ret
    522 
    523 ; --------------------------------------------------------------------------
    524 ;
    525 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
    526 ; It's still a box filter.
    527 ;
        ; Every input sample is simply written twice: punpcklbw/punpckhbw of a
        ; register with itself duplicates each byte, turning one block of
        ; SIZEOF_XMMWORD input samples into 2*SIZEOF_XMMWORD output samples.
        ;
        ; output_width is rounded up to a multiple of 2*SIZEOF_XMMWORD, so whole
        ; blocks are always read and written (movdqa, hence aligned row pointers);
        ; the stores may therefore extend past output_width within the row.
        ; Returns immediately if the rounded width or max_v_samp_factor is zero.
        ;
        ; Register use: edx = rounded output width, eax = output columns left in
        ; the current row, ecx = rows left, esi = inptr, edi = outptr.
        ;
    528 ; GLOBAL(void)
    529 ; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
    530 ;                           JDIMENSION output_width,
    531 ;                           JSAMPARRAY input_data,
    532 ;                           JSAMPARRAY * output_data_ptr);
    533 ;
    534 
    535 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor
    536 %define output_width(b)	(b)+12		; JDIMENSION output_width
    537 %define input_data(b)		(b)+16		; JSAMPARRAY input_data
    538 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
    539 
    540 	align	16
    541 	global	EXTN(jsimd_h2v1_upsample_sse2) PRIVATE
    542 
    543 EXTN(jsimd_h2v1_upsample_sse2):
    544 	push	ebp
    545 	mov	ebp,esp
    546 ;	push	ebx		; unused
    547 ;	push	ecx		; need not be preserved
    548 ;	push	edx		; need not be preserved
    549 	push	esi
    550 	push	edi
    551 
    552 	mov	edx, JDIMENSION [output_width(ebp)]
    553 	add	edx, byte (2*SIZEOF_XMMWORD)-1
    554 	and	edx, byte -(2*SIZEOF_XMMWORD)	; round up to whole output blocks
    555 	jz	short .return			; zero width: nothing to do
    556 
    557 	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
    558 	test	ecx,ecx
    559 	jz	short .return			; zero rows: nothing to do
    560 
    561 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
    562 	mov	edi, POINTER [output_data_ptr(ebp)]
    563 	mov	edi, JSAMPARRAY [edi]			; output_data
    564 	alignx	16,7
    565 .rowloop:
    566 	push	edi
    567 	push	esi
    568 
    569 	mov	esi, JSAMPROW [esi]		; inptr
    570 	mov	edi, JSAMPROW [edi]		; outptr
    571 	mov	eax,edx				; colctr
    572 	alignx	16,7
    573 .columnloop:
        ; Unrolled twice: each half expands one input block into two output blocks.
    574 
    575 	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    576 
    577 	movdqa    xmm1,xmm0
    578 	punpcklbw xmm0,xmm0		; low 8 samples, each duplicated
    579 	punpckhbw xmm1,xmm1		; high 8 samples, each duplicated
    580 
    581 	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    582 	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
    583 
    584 	sub	eax, byte 2*SIZEOF_XMMWORD
    585 	jz	short .nextrow		; row finished after the first half
    586 
    587 	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
    588 
    589 	movdqa    xmm3,xmm2
    590 	punpcklbw xmm2,xmm2
    591 	punpckhbw xmm3,xmm3
    592 
    593 	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
    594 	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
    595 
    596 	sub	eax, byte 2*SIZEOF_XMMWORD
    597 	jz	short .nextrow		; row finished after the second half
    598 
    599 	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
    600 	add	edi, byte 4*SIZEOF_XMMWORD	; outptr
    601 	jmp	short .columnloop
    602 	alignx	16,7
    603 
    604 .nextrow:
    605 	pop	esi			; input_data
    606 	pop	edi			; output_data
    607 
    608 	add	esi, byte SIZEOF_JSAMPROW	; input_data
    609 	add	edi, byte SIZEOF_JSAMPROW	; output_data
    610 	dec	ecx				; rowctr
    611 	jg	short .rowloop		; loop while rowctr > 0 (signed compare)
    612 
    613 .return:
    614 	pop	edi
    615 	pop	esi
    616 ;	pop	edx		; need not be preserved
    617 ;	pop	ecx		; need not be preserved
    618 ;	pop	ebx		; unused
    619 	pop	ebp
    620 	ret
    621 
    622 ; --------------------------------------------------------------------------
    623 ;
    624 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
    625 ; It's still a box filter.
    626 ;
        ; Every input sample is written twice horizontally (punpcklbw/punpckhbw of
        ; a register with itself) and the resulting block is stored to two
        ; consecutive output rows, so each input row yields two identical output
        ; rows.  rowctr is decreased by 2 per input row.
        ;
        ; output_width is rounded up to a multiple of 2*SIZEOF_XMMWORD, so whole
        ; blocks are always read and written (movdqa, hence aligned row pointers);
        ; the stores may therefore extend past output_width within the rows.
        ; Returns immediately if the rounded width or max_v_samp_factor is zero.
        ;
        ; Register use: edx = rounded output width, eax = output columns left in
        ; the current row, ecx = rows left, esi = inptr, ebx = outptr0,
        ; edi = outptr1.  ebx is saved and restored with plain push/pop.
        ;
    627 ; GLOBAL(void)
    628 ; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
    629 ;                           JDIMENSION output_width,
    630 ;                           JSAMPARRAY input_data,
    631 ;                           JSAMPARRAY * output_data_ptr);
    632 ;
    633 
    634 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor
    635 %define output_width(b)	(b)+12		; JDIMENSION output_width
    636 %define input_data(b)		(b)+16		; JSAMPARRAY input_data
    637 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
    638 
    639 	align	16
    640 	global	EXTN(jsimd_h2v2_upsample_sse2) PRIVATE
    641 
    642 EXTN(jsimd_h2v2_upsample_sse2):
    643 	push	ebp
    644 	mov	ebp,esp
    645 	push	ebx
    646 ;	push	ecx		; need not be preserved
    647 ;	push	edx		; need not be preserved
    648 	push	esi
    649 	push	edi
    650 
    651 	mov	edx, JDIMENSION [output_width(ebp)]
    652 	add	edx, byte (2*SIZEOF_XMMWORD)-1
    653 	and	edx, byte -(2*SIZEOF_XMMWORD)	; round up to whole output blocks
    654 	jz	near .return			; zero width: nothing to do
    655 
    656 	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
    657 	test	ecx,ecx
    658 	jz	near .return			; zero rows: nothing to do
    659 
    660 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
    661 	mov	edi, POINTER [output_data_ptr(ebp)]
    662 	mov	edi, JSAMPARRAY [edi]			; output_data
    663 	alignx	16,7
    664 .rowloop:
    665 	push	edi
    666 	push	esi
    667 
    668 	mov	esi, JSAMPROW [esi]			; inptr
    669 	mov	ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
    670 	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
    671 	mov	eax,edx					; colctr
    672 	alignx	16,7
    673 .columnloop:
        ; Unrolled twice: each half expands one input block into two output blocks
        ; and stores them to both output rows.
    674 
    675 	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    676 
    677 	movdqa    xmm1,xmm0
    678 	punpcklbw xmm0,xmm0		; low 8 samples, each duplicated
    679 	punpckhbw xmm1,xmm1		; high 8 samples, each duplicated
    680 
    681 	movdqa	XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
    682 	movdqa	XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
    683 	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    684 	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
    685 
    686 	sub	eax, byte 2*SIZEOF_XMMWORD
    687 	jz	short .nextrow		; row finished after the first half
    688 
    689 	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
    690 
    691 	movdqa    xmm3,xmm2
    692 	punpcklbw xmm2,xmm2
    693 	punpckhbw xmm3,xmm3
    694 
    695 	movdqa	XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
    696 	movdqa	XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
    697 	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
    698 	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
    699 
    700 	sub	eax, byte 2*SIZEOF_XMMWORD
    701 	jz	short .nextrow		; row finished after the second half
    702 
    703 	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
    704 	add	ebx, byte 4*SIZEOF_XMMWORD	; outptr0
    705 	add	edi, byte 4*SIZEOF_XMMWORD	; outptr1
    706 	jmp	short .columnloop
    707 	alignx	16,7
    708 
    709 .nextrow:
    710 	pop	esi			; input_data
    711 	pop	edi			; output_data
    712 
    713 	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
    714 	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
    715 	sub	ecx, byte 2			; rowctr
    716 	jg	short .rowloop		; loop while rowctr > 0 (signed compare)
    717 
    718 .return:
    719 	pop	edi
    720 	pop	esi
    721 ;	pop	edx		; need not be preserved
    722 ;	pop	ecx		; need not be preserved
    723 	pop	ebx
    724 	pop	ebp
    725 	ret
    726 
    727 ; For some reason, the OS X linker does not honor the request to align the
    728 ; segment unless we do this.
    729 	align	16
    730