Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jdsammmx.asm - upsampling (MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman (a] cendio.se> for Cendio AB
      5 ;
      6 ; Based on
      7 ; x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file must be assembled with NASM (Netwide Assembler); it
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jsimdext.inc"
     20 
     21 ; --------------------------------------------------------------------------
     22 	SECTION	SEG_CONST
     23 ; Constant table for the two "fancy" (triangle filter) upsamplers in this file.
     24 	alignz	16
     25 	global	EXTN(jconst_fancy_upsample_mmx) PRIVATE
     26 ; Each PW_* entry below is four identical 16-bit words, used as a packed-word operand.
     27 EXTN(jconst_fancy_upsample_mmx):
     28 
     29 PW_ONE		times 4 dw  1	; h2v1: rounding bias of the even outputs (added before >>2)
     30 PW_TWO		times 4 dw  2	; h2v1: rounding bias of the odd outputs (added before >>2)
     31 PW_THREE	times 4 dw  3	; weight of the nearer sample (h2v1 and h2v2)
     32 PW_SEVEN	times 4 dw  7	; h2v2: rounding bias of the odd outputs (added before >>4)
     33 PW_EIGHT	times 4 dw  8	; h2v2: rounding bias of the even outputs (added before >>4)
     34 
     35 	alignz	16
     36 
     37 ; --------------------------------------------------------------------------
     38 	SECTION	SEG_TEXT
     39 	BITS	32
     40 ;
     41 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
     42 ;
     43 ; The upsampling algorithm is linear interpolation between pixel centers,
     44 ; also known as a "triangle filter".  This is a good compromise between
     45 ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
     46 ; of the way between input pixel centers.
     47 ; For input sample s[i]: out[2i] = (3*s[i] + s[i-1] + 1) >> 2, out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2; at both row ends the missing neighbour is the edge sample itself.
     48 ; GLOBAL(void)
     49 ; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
     50 ;                                JDIMENSION downsampled_width,
     51 ;                                JSAMPARRAY input_data,
     52 ;                                JSAMPARRAY * output_data_ptr);
     53 ; Arguments are read from the stack relative to ebp (offsets below); returns at once if the width or the row count is zero.
     54 ; Rows are processed one input mmword at a time, each producing two output mmwords, with the width rounded up to whole mmwords; a dummy sample is written at inptr[width] when the width is not a multiple of SIZEOF_MMWORD, so both row buffers need room past the nominal width.
     55 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor
     56 %define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
     57 %define input_data(b)		(b)+16		; JSAMPARRAY input_data
     58 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
     59 ; Register roles: eax=colctr, ecx=rowctr, esi=input_data/inptr, edi=output_data/outptr, ebx=base for GOTOFF() constants, mm0=zero, mm7=left neighbour carried between blocks, mm6=right neighbour.
     60 	align	16
     61 	global	EXTN(jsimd_h2v1_fancy_upsample_mmx) PRIVATE
     62 
     63 EXTN(jsimd_h2v1_fancy_upsample_mmx):
     64 	push	ebp
     65 	mov	ebp,esp
     66 	pushpic	ebx		; macro from jsimdext.inc (not shown here); paired with poppic at .return
     67 ;	push	ecx		; need not be preserved
     68 ;	push	edx		; need not be preserved
     69 	push	esi
     70 	push	edi
     71 
     72 	get_GOT	ebx		; get GOT address
     73 
     74 	mov	eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
     75 	test	eax,eax
     76 	jz	near .return	; zero width: nothing to do
     77 
     78 	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
     79 	test	ecx,ecx
     80 	jz	near .return	; zero rows: nothing to do
     81 
     82 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
     83 	mov	edi, POINTER [output_data_ptr(ebp)]
     84 	mov	edi, JSAMPARRAY [edi]			; output_data
     85 	alignx	16,7
     86 .rowloop:
     87 	push	eax			; colctr
     88 	push	edi			; output_data (restored after the row)
     89 	push	esi			; input_data (restored after the row)
     90 
     91 	mov	esi, JSAMPROW [esi]	; inptr
     92 	mov	edi, JSAMPROW [edi]	; outptr
     93 
     94 	test	eax, SIZEOF_MMWORD-1	; is the width a whole number of mmwords?
     95 	jz	short .skip		; yes: no padding sample needed
     96 	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]	; dl = last real sample
     97 	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample (right neighbour of the last real one)
     98 .skip:
     99 	pxor	mm0,mm0			; mm0=(all 0's)
    100 	pcmpeqb	mm7,mm7			; mm7=(all 1's)
    101 	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT	; mm7=mask selecting the lowest byte
    102 	pand	mm7, MMWORD [esi+0*SIZEOF_MMWORD]	; mm7=( 0 - - - - - - -): sample 0 is its own left neighbour
    103 
    104 	add	eax, byte SIZEOF_MMWORD-1
    105 	and	eax, byte -SIZEOF_MMWORD	; colctr rounded up to whole mmwords
    106 	cmp	eax, byte SIZEOF_MMWORD
    107 	ja	short .columnloop		; more than one block: a following block exists
    108 	alignx	16,7
    109 
    110 .columnloop_last:
    111 	pcmpeqb	mm6,mm6			; mm6=(all 1's)
    112 	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT	; mm6=mask selecting the highest byte
    113 	pand	mm6, MMWORD [esi+0*SIZEOF_MMWORD]	; mm6=( - - - - - - - 7): last sample is its own right neighbour
    114 	jmp	short .upsample
    115 	alignx	16,7
    116 
    117 .columnloop:
    118 	movq	mm6, MMWORD [esi+1*SIZEOF_MMWORD]	; mm6=next block
    119 	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT	; mm6=( - - - - - - - 8): first sample of the next block
    120 
    121 .upsample:
    122 	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]
    123 	movq	mm2,mm1
    124 	movq	mm3,mm1			; mm1=( 0 1 2 3 4 5 6 7)
    125 	psllq	mm2,BYTE_BIT		; mm2=( - 0 1 2 3 4 5 6)
    126 	psrlq	mm3,BYTE_BIT		; mm3=( 1 2 3 4 5 6 7 -)
    127 
    128 	por	mm2,mm7			; mm2=(-1 0 1 2 3 4 5 6)
    129 	por	mm3,mm6			; mm3=( 1 2 3 4 5 6 7 8)
    130 
    131 	movq	mm7,mm1
    132 	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT	; mm7=( 7 - - - - - - -)
    133 
    134 	movq      mm4,mm1
    135 	punpcklbw mm1,mm0		; mm1=( 0 1 2 3)
    136 	punpckhbw mm4,mm0		; mm4=( 4 5 6 7)
    137 	movq      mm5,mm2
    138 	punpcklbw mm2,mm0		; mm2=(-1 0 1 2)
    139 	punpckhbw mm5,mm0		; mm5=( 3 4 5 6)
    140 	movq      mm6,mm3
    141 	punpcklbw mm3,mm0		; mm3=( 1 2 3 4)
    142 	punpckhbw mm6,mm0		; mm6=( 5 6 7 8)
    143 
    144 	pmullw	mm1,[GOTOFF(ebx,PW_THREE)]	; mm1=3*( 0 1 2 3)
    145 	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]	; mm4=3*( 4 5 6 7)
    146 	paddw	mm2,[GOTOFF(ebx,PW_ONE)]	; left neighbours + bias 1
    147 	paddw	mm5,[GOTOFF(ebx,PW_ONE)]
    148 	paddw	mm3,[GOTOFF(ebx,PW_TWO)]	; right neighbours + bias 2
    149 	paddw	mm6,[GOTOFF(ebx,PW_TWO)]
    150 
    151 	paddw	mm2,mm1			; 3*s[i] + s[i-1] + 1
    152 	paddw	mm5,mm4
    153 	psrlw	mm2,2			; mm2=OutLE=( 0  2  4  6)
    154 	psrlw	mm5,2			; mm5=OutHE=( 8 10 12 14)
    155 	paddw	mm3,mm1			; 3*s[i] + s[i+1] + 2
    156 	paddw	mm6,mm4
    157 	psrlw	mm3,2			; mm3=OutLO=( 1  3  5  7)
    158 	psrlw	mm6,2			; mm6=OutHO=( 9 11 13 15)
    159 
    160 	psllw	mm3,BYTE_BIT		; odd outputs go to the high byte of each word
    161 	psllw	mm6,BYTE_BIT
    162 	por	mm2,mm3			; mm2=OutL=( 0  1  2  3  4  5  6  7)
    163 	por	mm5,mm6			; mm5=OutH=( 8  9 10 11 12 13 14 15)
    164 
    165 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2
    166 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm5
    167 
    168 	sub	eax, byte SIZEOF_MMWORD		; one input block done
    169 	add	esi, byte 1*SIZEOF_MMWORD	; inptr
    170 	add	edi, byte 2*SIZEOF_MMWORD	; outptr
    171 	cmp	eax, byte SIZEOF_MMWORD
    172 	ja	near .columnloop		; more than one block left
    173 	test	eax,eax
    174 	jnz	near .columnloop_last		; exactly one block left
    175 
    176 	pop	esi				; input_data
    177 	pop	edi				; output_data
    178 	pop	eax				; colctr (original, unrounded width)
    179 
    180 	add	esi, byte SIZEOF_JSAMPROW	; input_data
    181 	add	edi, byte SIZEOF_JSAMPROW	; output_data
    182 	dec	ecx				; rowctr
    183 	jg	near .rowloop
    184 
    185 	emms		; empty MMX state
    186 
    187 .return:
    188 	pop	edi
    189 	pop	esi
    190 ;	pop	edx		; need not be preserved
    191 ;	pop	ecx		; need not be preserved
    192 	poppic	ebx
    193 	pop	ebp
    194 	ret
    195 
    196 ; --------------------------------------------------------------------------
    197 ;
    198 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
    199 ; Again a triangle filter; see comments for h2v1 case, above.
    200 ; Each input row yields two output rows. Vertical pass: Int0 = 3*row[0] + row[-1] (upper row), Int1 = 3*row[0] + row[+1] (lower row), parked as 16-bit words in the output rows. Horizontal pass: out[2i] = (3*Int[i] + Int[i-1] + 8) >> 4, out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4, with the edge value standing in for the missing neighbour at both row ends.
    201 ; GLOBAL(void)
    202 ; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
    203 ;                                JDIMENSION downsampled_width,
    204 ;                                JSAMPARRAY input_data,
    205 ;                                JSAMPARRAY * output_data_ptr);
    206 ; input_data[-1] and input_data[+1] are dereferenced for every row, so valid row pointers must exist above and below each input row; a dummy sample is written at index [width] of all three input rows when the width is not a multiple of SIZEOF_MMWORD.
    207 ; Register roles in the row loop: eax=colctr, ecx=inptr1(above), ebx=inptr0 (swapped with the GOT base around constant accesses), esi=inptr1(below), edx=outptr0, edi=outptr1.
    208 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor
    209 %define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
    210 %define input_data(b)		(b)+16		; JSAMPARRAY input_data
    211 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
    212 ; Frame layout: ebp is realigned to SIZEOF_MMWORD; [ebp+0] holds the unaligned frame pointer, wk(0)..wk(WK_NUM-1) lie directly below ebp and gotptr directly below wk(0).
    213 %define original_ebp	ebp+0
    214 %define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
    215 %define WK_NUM		4
    216 %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
    217 ; wk(0)/wk(1) = left-neighbour word for the upper/lower row; wk(2)/wk(3) = right-neighbour word for the upper/lower row.
    218 	align	16
    219 	global	EXTN(jsimd_h2v2_fancy_upsample_mmx) PRIVATE
    220 
    221 EXTN(jsimd_h2v2_fancy_upsample_mmx):
    222 	push	ebp
    223 	mov	eax,esp				; eax = original ebp
    224 	sub	esp, byte 4			; room to store eax after alignment
    225 	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
    226 	mov	[esp],eax			; saved so the epilogue can restore esp with pop esp
    227 	mov	ebp,esp				; ebp = aligned ebp
    228 	lea	esp, [wk(0)]			; reserve the wk[] scratch area
    229 	pushpic	eax		; make a room for GOT address
    230 	push	ebx
    231 ;	push	ecx		; need not be preserved
    232 ;	push	edx		; need not be preserved
    233 	push	esi
    234 	push	edi
    235 
    236 	get_GOT	ebx			; get GOT address
    237 	movpic	POINTER [gotptr], ebx	; save GOT address
    238 
    239 	mov	edx,eax				; edx = original ebp
    240 	mov	eax, JDIMENSION [downsamp_width(edx)]  ; colctr
    241 	test	eax,eax
    242 	jz	near .return			; zero width: nothing to do
    243 
    244 	mov	ecx, INT [max_v_samp(edx)]	; rowctr
    245 	test	ecx,ecx
    246 	jz	near .return			; zero rows: nothing to do
    247 
    248 	mov	esi, JSAMPARRAY [input_data(edx)]	; input_data
    249 	mov	edi, POINTER [output_data_ptr(edx)]
    250 	mov	edi, JSAMPARRAY [edi]			; output_data
    251 	alignx	16,7
    252 .rowloop:
    253 	push	eax					; colctr
    254 	push	ecx					; rowctr (ecx is reused as a row pointer below)
    255 	push	edi					; output_data
    256 	push	esi					; input_data
    257 
    258 	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
    259 	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
    260 	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
    261 	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
    262 	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
    263 
    264 	test	eax, SIZEOF_MMWORD-1	; is the width a whole number of mmwords?
    265 	jz	short .skip		; yes: no padding samples needed
    266 	push	edx			; dl is used as scratch; keep outptr0
    267 	mov	dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
    268 	mov	JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl	; dummy sample, row above
    269 	mov	dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
    270 	mov	JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl	; dummy sample, current row
    271 	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    272 	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
    273 	pop	edx
    274 .skip:
    275 	; -- process the first column block
    276 
    277 	movq	mm0, MMWORD [ebx+0*SIZEOF_MMWORD]	; mm0=row[ 0][0]
    278 	movq	mm1, MMWORD [ecx+0*SIZEOF_MMWORD]	; mm1=row[-1][0]
    279 	movq	mm2, MMWORD [esi+0*SIZEOF_MMWORD]	; mm2=row[+1][0]
    280 
    281 	pushpic	ebx			; ebx holds inptr0: set it aside while ebx addresses the constants
    282 	movpic	ebx, POINTER [gotptr]	; load GOT address
    283 
    284 	pxor      mm3,mm3		; mm3=(all 0's)
    285 	movq      mm4,mm0
    286 	punpcklbw mm0,mm3		; mm0=row[ 0][0]( 0 1 2 3)
    287 	punpckhbw mm4,mm3		; mm4=row[ 0][0]( 4 5 6 7)
    288 	movq      mm5,mm1
    289 	punpcklbw mm1,mm3		; mm1=row[-1][0]( 0 1 2 3)
    290 	punpckhbw mm5,mm3		; mm5=row[-1][0]( 4 5 6 7)
    291 	movq      mm6,mm2
    292 	punpcklbw mm2,mm3		; mm2=row[+1][0]( 0 1 2 3)
    293 	punpckhbw mm6,mm3		; mm6=row[+1][0]( 4 5 6 7)
    294 
    295 	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]	; mm0=3*row[ 0][0]( 0 1 2 3)
    296 	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]	; mm4=3*row[ 0][0]( 4 5 6 7)
    297 
    298 	pcmpeqb	mm7,mm7			; mm7=(all 1's)
    299 	psrlq	mm7,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm7=mask selecting the lowest word
    300 
    301 	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
    302 	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
    303 	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
    304 	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
    305 
    306 	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1	; temporarily save
    307 	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5	; the intermediate data
    308 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2	; (same for the lower output row)
    309 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm6
    310 
    311 	pand	mm1,mm7			; mm1=( 0 - - -)
    312 	pand	mm2,mm7			; mm2=( 0 - - -)
    313 
    314 	movq	MMWORD [wk(0)], mm1	; left neighbour of word 0, upper row (the word itself)
    315 	movq	MMWORD [wk(1)], mm2	; left neighbour of word 0, lower row (the word itself)
    316 
    317 	poppic	ebx			; counterpart of the pushpic above
    318 
    319 	add	eax, byte SIZEOF_MMWORD-1
    320 	and	eax, byte -SIZEOF_MMWORD	; colctr rounded up to whole mmwords
    321 	cmp	eax, byte SIZEOF_MMWORD
    322 	ja	short .columnloop		; more than one block: a following block exists
    323 	alignx	16,7
    324 
    325 .columnloop_last:
    326 	; -- process the last column block
    327 
    328 	pushpic	ebx			; set inptr0 aside; matched by the poppic after .upsample
    329 	movpic	ebx, POINTER [gotptr]	; load GOT address
    330 
    331 	pcmpeqb	mm1,mm1			; mm1=(all 1's)
    332 	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm1=mask selecting the highest word
    333 	movq	mm2,mm1
    334 
    335 	pand	mm1, MMWORD [edx+1*SIZEOF_MMWORD]	; mm1=( - - - 7)
    336 	pand	mm2, MMWORD [edi+1*SIZEOF_MMWORD]	; mm2=( - - - 7)
    337 
    338 	movq	MMWORD [wk(2)], mm1	; right neighbour of word 7, upper row (the word itself)
    339 	movq	MMWORD [wk(3)], mm2	; right neighbour of word 7, lower row (the word itself)
    340 
    341 	jmp	short .upsample
    342 	alignx	16,7
    343 
    344 .columnloop:
    345 	; -- process the next column block
    346 
    347 	movq	mm0, MMWORD [ebx+1*SIZEOF_MMWORD]	; mm0=row[ 0][1]
    348 	movq	mm1, MMWORD [ecx+1*SIZEOF_MMWORD]	; mm1=row[-1][1]
    349 	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]	; mm2=row[+1][1]
    350 
    351 	pushpic	ebx			; set inptr0 aside; matched by the poppic after .upsample
    352 	movpic	ebx, POINTER [gotptr]	; load GOT address
    353 
    354 	pxor      mm3,mm3		; mm3=(all 0's)
    355 	movq      mm4,mm0
    356 	punpcklbw mm0,mm3		; mm0=row[ 0][1]( 0 1 2 3)
    357 	punpckhbw mm4,mm3		; mm4=row[ 0][1]( 4 5 6 7)
    358 	movq      mm5,mm1
    359 	punpcklbw mm1,mm3		; mm1=row[-1][1]( 0 1 2 3)
    360 	punpckhbw mm5,mm3		; mm5=row[-1][1]( 4 5 6 7)
    361 	movq      mm6,mm2
    362 	punpcklbw mm2,mm3		; mm2=row[+1][1]( 0 1 2 3)
    363 	punpckhbw mm6,mm3		; mm6=row[+1][1]( 4 5 6 7)
    364 
    365 	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]	; mm0=3*row[ 0][1]( 0 1 2 3)
    366 	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]	; mm4=3*row[ 0][1]( 4 5 6 7)
    367 
    368 	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
    369 	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
    370 	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
    371 	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
    372 
    373 	movq	MMWORD [edx+2*SIZEOF_MMWORD], mm1	; temporarily save
    374 	movq	MMWORD [edx+3*SIZEOF_MMWORD], mm5	; the intermediate data
    375 	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2	; (next block's slots in the output rows)
    376 	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm6
    377 
    378 	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm1=( - - - 0)
    379 	psllq	mm2,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm2=( - - - 0)
    380 
    381 	movq	MMWORD [wk(2)], mm1	; right neighbour for the current block, upper row
    382 	movq	MMWORD [wk(3)], mm2	; right neighbour for the current block, lower row
    383 
    384 .upsample:
    385 	; -- process the upper row
    386 
    387 	movq	mm7, MMWORD [edx+0*SIZEOF_MMWORD]	; mm7=Int0L=( 0 1 2 3)
    388 	movq	mm3, MMWORD [edx+1*SIZEOF_MMWORD]	; mm3=Int0H=( 4 5 6 7)
    389 
    390 	movq	mm0,mm7
    391 	movq	mm4,mm3
    392 	psrlq	mm0,2*BYTE_BIT			; mm0=( 1 2 3 -)
    393 	psllq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( - - - 4)
    394 	movq	mm5,mm7
    395 	movq	mm6,mm3
    396 	psrlq	mm5,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm5=( 3 - - -)
    397 	psllq	mm6,2*BYTE_BIT			; mm6=( - 4 5 6)
    398 
    399 	por	mm0,mm4				; mm0=( 1 2 3 4)
    400 	por	mm5,mm6				; mm5=( 3 4 5 6)
    401 
    402 	movq	mm1,mm7
    403 	movq	mm2,mm3
    404 	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
    405 	psrlq	mm2,2*BYTE_BIT			; mm2=( 5 6 7 -)
    406 	movq	mm4,mm3
    407 	psrlq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( 7 - - -)
    408 
    409 	por	mm1, MMWORD [wk(0)]		; mm1=(-1 0 1 2)
    410 	por	mm2, MMWORD [wk(2)]		; mm2=( 5 6 7 8)
    411 
    412 	movq	MMWORD [wk(0)], mm4		; becomes the left neighbour of the next block
    413 
    414 	pmullw	mm7,[GOTOFF(ebx,PW_THREE)]	; mm7=3*Int0L
    415 	pmullw	mm3,[GOTOFF(ebx,PW_THREE)]	; mm3=3*Int0H
    416 	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]	; left neighbours + bias 8
    417 	paddw	mm5,[GOTOFF(ebx,PW_EIGHT)]
    418 	paddw	mm0,[GOTOFF(ebx,PW_SEVEN)]	; right neighbours + bias 7
    419 	paddw	mm2,[GOTOFF(ebx,PW_SEVEN)]
    420 
    421 	paddw	mm1,mm7			; 3*Int0[i] + Int0[i-1] + 8
    422 	paddw	mm5,mm3
    423 	psrlw	mm1,4			; mm1=Out0LE=( 0  2  4  6)
    424 	psrlw	mm5,4			; mm5=Out0HE=( 8 10 12 14)
    425 	paddw	mm0,mm7			; 3*Int0[i] + Int0[i+1] + 7
    426 	paddw	mm2,mm3
    427 	psrlw	mm0,4			; mm0=Out0LO=( 1  3  5  7)
    428 	psrlw	mm2,4			; mm2=Out0HO=( 9 11 13 15)
    429 
    430 	psllw	mm0,BYTE_BIT		; odd outputs go to the high byte of each word
    431 	psllw	mm2,BYTE_BIT
    432 	por	mm1,mm0			; mm1=Out0L=( 0  1  2  3  4  5  6  7)
    433 	por	mm5,mm2			; mm5=Out0H=( 8  9 10 11 12 13 14 15)
    434 
    435 	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1	; final samples overwrite the parked intermediates
    436 	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5
    437 
    438 	; -- process the lower row
    439 
    440 	movq	mm6, MMWORD [edi+0*SIZEOF_MMWORD]	; mm6=Int1L=( 0 1 2 3)
    441 	movq	mm4, MMWORD [edi+1*SIZEOF_MMWORD]	; mm4=Int1H=( 4 5 6 7)
    442 
    443 	movq	mm7,mm6
    444 	movq	mm3,mm4
    445 	psrlq	mm7,2*BYTE_BIT			; mm7=( 1 2 3 -)
    446 	psllq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( - - - 4)
    447 	movq	mm0,mm6
    448 	movq	mm2,mm4
    449 	psrlq	mm0,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm0=( 3 - - -)
    450 	psllq	mm2,2*BYTE_BIT			; mm2=( - 4 5 6)
    451 
    452 	por	mm7,mm3				; mm7=( 1 2 3 4)
    453 	por	mm0,mm2				; mm0=( 3 4 5 6)
    454 
    455 	movq	mm1,mm6
    456 	movq	mm5,mm4
    457 	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
    458 	psrlq	mm5,2*BYTE_BIT			; mm5=( 5 6 7 -)
    459 	movq	mm3,mm4
    460 	psrlq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( 7 - - -)
    461 
    462 	por	mm1, MMWORD [wk(1)]		; mm1=(-1 0 1 2)
    463 	por	mm5, MMWORD [wk(3)]		; mm5=( 5 6 7 8)
    464 
    465 	movq	MMWORD [wk(1)], mm3		; becomes the left neighbour of the next block
    466 
    467 	pmullw	mm6,[GOTOFF(ebx,PW_THREE)]	; mm6=3*Int1L
    468 	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]	; mm4=3*Int1H
    469 	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]	; left neighbours + bias 8
    470 	paddw	mm0,[GOTOFF(ebx,PW_EIGHT)]
    471 	paddw	mm7,[GOTOFF(ebx,PW_SEVEN)]	; right neighbours + bias 7
    472 	paddw	mm5,[GOTOFF(ebx,PW_SEVEN)]
    473 
    474 	paddw	mm1,mm6			; 3*Int1[i] + Int1[i-1] + 8
    475 	paddw	mm0,mm4
    476 	psrlw	mm1,4			; mm1=Out1LE=( 0  2  4  6)
    477 	psrlw	mm0,4			; mm0=Out1HE=( 8 10 12 14)
    478 	paddw	mm7,mm6			; 3*Int1[i] + Int1[i+1] + 7
    479 	paddw	mm5,mm4
    480 	psrlw	mm7,4			; mm7=Out1LO=( 1  3  5  7)
    481 	psrlw	mm5,4			; mm5=Out1HO=( 9 11 13 15)
    482 
    483 	psllw	mm7,BYTE_BIT		; odd outputs go to the high byte of each word
    484 	psllw	mm5,BYTE_BIT
    485 	por	mm1,mm7			; mm1=Out1L=( 0  1  2  3  4  5  6  7)
    486 	por	mm0,mm5			; mm0=Out1H=( 8  9 10 11 12 13 14 15)
    487 
    488 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm1	; final samples overwrite the parked intermediates
    489 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm0
    490 
    491 	poppic	ebx			; counterpart of the pushpic in .columnloop / .columnloop_last
    492 
    493 	sub	eax, byte SIZEOF_MMWORD		; one input block done
    494 	add	ecx, byte 1*SIZEOF_MMWORD	; inptr1(above)
    495 	add	ebx, byte 1*SIZEOF_MMWORD	; inptr0
    496 	add	esi, byte 1*SIZEOF_MMWORD	; inptr1(below)
    497 	add	edx, byte 2*SIZEOF_MMWORD	; outptr0
    498 	add	edi, byte 2*SIZEOF_MMWORD	; outptr1
    499 	cmp	eax, byte SIZEOF_MMWORD
    500 	ja	near .columnloop		; more than one block left
    501 	test	eax,eax
    502 	jnz	near .columnloop_last		; exactly one block left
    503 
    504 	pop	esi				; input_data
    505 	pop	edi				; output_data
    506 	pop	ecx				; rowctr
    507 	pop	eax				; colctr (original, unrounded width)
    508 
    509 	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
    510 	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
    511 	sub	ecx, byte 2			; rowctr
    512 	jg	near .rowloop
    513 
    514 	emms		; empty MMX state
    515 
    516 .return:
    517 	pop	edi
    518 	pop	esi
    519 ;	pop	edx		; need not be preserved
    520 ;	pop	ecx		; need not be preserved
    521 	pop	ebx
    522 	mov	esp,ebp		; esp <- aligned ebp
    523 	pop	esp		; esp <- original ebp
    524 	pop	ebp
    525 	ret
    526 
    527 ; --------------------------------------------------------------------------
    528 ;
    529 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
    530 ; It's still a box filter.
    531 ; Every input sample is simply stored twice: out[2i] = out[2i+1] = in[i].
    532 ; GLOBAL(void)
    533 ; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
    534 ;                          JDIMENSION output_width,
    535 ;                          JSAMPARRAY input_data,
    536 ;                          JSAMPARRAY * output_data_ptr);
    537 ; output_width is rounded up to a multiple of 2*SIZEOF_MMWORD; that many bytes are stored per output row (and half as many read per input row), so the row buffers must be padded accordingly.
    538 ; Register roles: edx=rounded output width, eax=colctr, ecx=rowctr, esi=input_data/inptr, edi=output_data/outptr.
    539 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor
    540 %define output_width(b)	(b)+12		; JDIMENSION output_width
    541 %define input_data(b)		(b)+16		; JSAMPARRAY input_data
    542 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
    543 
    544 	align	16
    545 	global	EXTN(jsimd_h2v1_upsample_mmx) PRIVATE
    546 
    547 EXTN(jsimd_h2v1_upsample_mmx):
    548 	push	ebp
    549 	mov	ebp,esp
    550 ;	push	ebx		; unused
    551 ;	push	ecx		; need not be preserved
    552 ;	push	edx		; need not be preserved
    553 	push	esi
    554 	push	edi
    555 
    556 	mov	edx, JDIMENSION [output_width(ebp)]
    557 	add	edx, byte (2*SIZEOF_MMWORD)-1
    558 	and	edx, byte -(2*SIZEOF_MMWORD)	; edx = width rounded up to whole output blocks
    559 	jz	short .return			; zero width: nothing to do
    560 
    561 	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
    562 	test	ecx,ecx
    563 	jz	short .return			; zero rows: nothing to do
    564 
    565 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
    566 	mov	edi, POINTER [output_data_ptr(ebp)]
    567 	mov	edi, JSAMPARRAY [edi]			; output_data
    568 	alignx	16,7
    569 .rowloop:
    570 	push	edi				; output_data
    571 	push	esi				; input_data
    572 
    573 	mov	esi, JSAMPROW [esi]		; inptr
    574 	mov	edi, JSAMPROW [edi]		; outptr
    575 	mov	eax,edx				; colctr
    576 	alignx	16,7
    577 .columnloop:
    578 	; loop body is unrolled twice: two input mmwords -> four output mmwords
    579 	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
    580 
    581 	movq      mm1,mm0
    582 	punpcklbw mm0,mm0		; low half, each sample duplicated
    583 	punpckhbw mm1,mm1		; high half, each sample duplicated
    584 
    585 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
    586 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm1
    587 
    588 	sub	eax, byte 2*SIZEOF_MMWORD	; two output mmwords written
    589 	jz	short .nextrow
    590 
    591 	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]
    592 
    593 	movq      mm3,mm2
    594 	punpcklbw mm2,mm2		; low half, each sample duplicated
    595 	punpckhbw mm3,mm3		; high half, each sample duplicated
    596 
    597 	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
    598 	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm3
    599 
    600 	sub	eax, byte 2*SIZEOF_MMWORD	; two more output mmwords written
    601 	jz	short .nextrow
    602 
    603 	add	esi, byte 2*SIZEOF_MMWORD	; inptr
    604 	add	edi, byte 4*SIZEOF_MMWORD	; outptr
    605 	jmp	short .columnloop
    606 	alignx	16,7
    607 
    608 .nextrow:
    609 	pop	esi				; input_data
    610 	pop	edi				; output_data
    611 
    612 	add	esi, byte SIZEOF_JSAMPROW	; input_data
    613 	add	edi, byte SIZEOF_JSAMPROW	; output_data
    614 	dec	ecx				; rowctr
    615 	jg	short .rowloop
    616 
    617 	emms		; empty MMX state
    618 
    619 .return:
    620 	pop	edi
    621 	pop	esi
    622 ;	pop	edx		; need not be preserved
    623 ;	pop	ecx		; need not be preserved
    624 ;	pop	ebx		; unused
    625 	pop	ebp
    626 	ret
    627 
    628 ; --------------------------------------------------------------------------
    629 ;
    630 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
    631 ; It's still a box filter.
    632 ; Every input sample is stored twice horizontally, and the same data goes to two consecutive output rows.
    633 ; GLOBAL(void)
    634 ; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
    635 ;                          JDIMENSION output_width,
    636 ;                          JSAMPARRAY input_data,
    637 ;                          JSAMPARRAY * output_data_ptr);
    638 ; output_width is rounded up to a multiple of 2*SIZEOF_MMWORD; that many bytes are stored in each of the two output rows, so the row buffers must be padded accordingly. Each pass consumes one input row and two output rows (rowctr -= 2).
    639 ; Register roles: edx=rounded output width, eax=colctr, ecx=rowctr, esi=input_data/inptr, ebx=outptr0, edi=output_data/outptr1.
    640 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor
    641 %define output_width(b)	(b)+12		; JDIMENSION output_width
    642 %define input_data(b)		(b)+16		; JSAMPARRAY input_data
    643 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
    644 
    645 	align	16
    646 	global	EXTN(jsimd_h2v2_upsample_mmx) PRIVATE
    647 
    648 EXTN(jsimd_h2v2_upsample_mmx):
    649 	push	ebp
    650 	mov	ebp,esp
    651 	push	ebx			; ebx is used as outptr0 below
    652 ;	push	ecx		; need not be preserved
    653 ;	push	edx		; need not be preserved
    654 	push	esi
    655 	push	edi
    656 
    657 	mov	edx, JDIMENSION [output_width(ebp)]
    658 	add	edx, byte (2*SIZEOF_MMWORD)-1
    659 	and	edx, byte -(2*SIZEOF_MMWORD)	; edx = width rounded up to whole output blocks
    660 	jz	near .return			; zero width: nothing to do
    661 
    662 	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
    663 	test	ecx,ecx
    664 	jz	short .return			; zero rows: nothing to do
    665 
    666 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
    667 	mov	edi, POINTER [output_data_ptr(ebp)]
    668 	mov	edi, JSAMPARRAY [edi]			; output_data
    669 	alignx	16,7
    670 .rowloop:
    671 	push	edi					; output_data
    672 	push	esi					; input_data
    673 
    674 	mov	esi, JSAMPROW [esi]			; inptr
    675 	mov	ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
    676 	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
    677 	mov	eax,edx					; colctr
    678 	alignx	16,7
    679 .columnloop:
    680 	; loop body is unrolled twice: two input mmwords -> four output mmwords per output row
    681 	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
    682 
    683 	movq      mm1,mm0
    684 	punpcklbw mm0,mm0		; low half, each sample duplicated
    685 	punpckhbw mm1,mm1		; high half, each sample duplicated
    686 
    687 	movq	MMWORD [ebx+0*SIZEOF_MMWORD], mm0	; upper output row
    688 	movq	MMWORD [ebx+1*SIZEOF_MMWORD], mm1
    689 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0	; lower output row (same data)
    690 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm1
    691 
    692 	sub	eax, byte 2*SIZEOF_MMWORD	; two output mmwords per row written
    693 	jz	short .nextrow
    694 
    695 	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]
    696 
    697 	movq      mm3,mm2
    698 	punpcklbw mm2,mm2		; low half, each sample duplicated
    699 	punpckhbw mm3,mm3		; high half, each sample duplicated
    700 
    701 	movq	MMWORD [ebx+2*SIZEOF_MMWORD], mm2	; upper output row
    702 	movq	MMWORD [ebx+3*SIZEOF_MMWORD], mm3
    703 	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2	; lower output row (same data)
    704 	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm3
    705 
    706 	sub	eax, byte 2*SIZEOF_MMWORD	; two more output mmwords per row written
    707 	jz	short .nextrow
    708 
    709 	add	esi, byte 2*SIZEOF_MMWORD	; inptr
    710 	add	ebx, byte 4*SIZEOF_MMWORD	; outptr0
    711 	add	edi, byte 4*SIZEOF_MMWORD	; outptr1
    712 	jmp	short .columnloop
    713 	alignx	16,7
    714 
    715 .nextrow:
    716 	pop	esi				; input_data
    717 	pop	edi				; output_data
    718 
    719 	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
    720 	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
    721 	sub	ecx, byte 2			; rowctr
    722 	jg	short .rowloop
    723 
    724 	emms		; empty MMX state
    725 
    726 .return:
    727 	pop	edi
    728 	pop	esi
    729 ;	pop	edx		; need not be preserved
    730 ;	pop	ecx		; need not be preserved
    731 	pop	ebx
    732 	pop	ebp
    733 	ret
    734 
    735 ; For some reason, the OS X linker does not honor the request to align the
    736 ; segment unless we do this.
    737 	align	16		; pad the end of the text section to a 16-byte boundary
    738