Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jdsamss2-64.asm - upsampling (64-bit SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright 2009 D. R. Commander
      6 ;
      7 ; Based on
      8 ; x86 SIMD extension for IJG JPEG library
      9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     11 ;
     12 ; This file should be assembled with NASM (Netwide Assembler),
     13 ; can *not* be assembled with Microsoft's MASM or any compatible
     14 ; assembler (including Borland's Turbo Assembler).
     15 ; NASM is available from http://nasm.sourceforge.net/ or
     16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     17 ;
     18 ; [TAB8]
     19 
     20 %include "jsimdext.inc"
     21 
     22 ; --------------------------------------------------------------------------
     23 	SECTION	SEG_CONST
     24 
     25 	alignz	16
     26 	global	EXTN(jconst_fancy_upsample_sse2) PRIVATE
     27 
     28 EXTN(jconst_fancy_upsample_sse2):
     29 
     30 PW_ONE		times 8 dw  1
     31 PW_TWO		times 8 dw  2
     32 PW_THREE	times 8 dw  3
     33 PW_SEVEN	times 8 dw  7
     34 PW_EIGHT	times 8 dw  8
     35 
     36 	alignz	16
     37 
     38 ; --------------------------------------------------------------------------
     39 	SECTION	SEG_TEXT
     40 	BITS	64
     41 ;
     42 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
     43 ;
     44 ; The upsampling algorithm is linear interpolation between pixel centers,
     45 ; also known as a "triangle filter".  This is a good compromise between
     46 ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
     47 ; of the way between input pixel centers.
     48 ;
     49 ; GLOBAL(void)
     50 ; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
     51 ;                                 JDIMENSION downsampled_width,
     52 ;                                 JSAMPARRAY input_data,
     53 ;                                 JSAMPARRAY * output_data_ptr);
     54 ;
     55 
     56 ; r10 = int max_v_samp_factor
     57 ; r11 = JDIMENSION downsampled_width
     58 ; r12 = JSAMPARRAY input_data
     59 ; r13 = JSAMPARRAY * output_data_ptr
     60 
     61 	align	16
     62 	global	EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE
     63 
     64 EXTN(jsimd_h2v1_fancy_upsample_sse2):
     65 	push	rbp
     66 	mov	rax,rsp
     67 	mov	rbp,rsp
     68 	collect_args
     69 
     70 	mov	rax, r11  ; colctr
     71 	test	rax,rax
     72 	jz	near .return
     73 
     74 	mov	rcx, r10	; rowctr
     75 	test	rcx,rcx
     76 	jz	near .return
     77 
     78 	mov	rsi, r12	; input_data
     79 	mov	rdi, r13
     80 	mov	rdi, JSAMPARRAY [rdi]			; output_data
     81 .rowloop:
     82 	push	rax			; colctr
     83 	push	rdi
     84 	push	rsi
     85 
     86 	mov	rsi, JSAMPROW [rsi]	; inptr
     87 	mov	rdi, JSAMPROW [rdi]	; outptr
     88 
     89 	test	rax, SIZEOF_XMMWORD-1
     90 	jz	short .skip
     91 	mov	dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
     92 	mov	JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
     93 .skip:
     94 	pxor	xmm0,xmm0		; xmm0=(all 0's)
     95 	pcmpeqb	xmm7,xmm7
     96 	psrldq	xmm7,(SIZEOF_XMMWORD-1)
     97 	pand	xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]
     98 
     99 	add	rax, byte SIZEOF_XMMWORD-1
    100 	and	rax, byte -SIZEOF_XMMWORD
    101 	cmp	rax, byte SIZEOF_XMMWORD
    102 	ja	short .columnloop
    103 
    104 .columnloop_last:
    105 	pcmpeqb	xmm6,xmm6
    106 	pslldq	xmm6,(SIZEOF_XMMWORD-1)
    107 	pand	xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    108 	jmp	short .upsample
    109 
    110 .columnloop:
    111 	movdqa	xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    112 	pslldq	xmm6,(SIZEOF_XMMWORD-1)
    113 
    114 .upsample:
    115 	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    116 	movdqa	xmm2,xmm1
    117 	movdqa	xmm3,xmm1		; xmm1=( 0  1  2 ... 13 14 15)
    118 	pslldq	xmm2,1			; xmm2=(--  0  1 ... 12 13 14)
    119 	psrldq	xmm3,1			; xmm3=( 1  2  3 ... 14 15 --)
    120 
    121 	por	xmm2,xmm7		; xmm2=(-1  0  1 ... 12 13 14)
    122 	por	xmm3,xmm6		; xmm3=( 1  2  3 ... 14 15 16)
    123 
    124 	movdqa	xmm7,xmm1
    125 	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; xmm7=(15 -- -- ... -- -- --)
    126 
    127 	movdqa    xmm4,xmm1
    128 	punpcklbw xmm1,xmm0		; xmm1=( 0  1  2  3  4  5  6  7)
    129 	punpckhbw xmm4,xmm0		; xmm4=( 8  9 10 11 12 13 14 15)
    130 	movdqa    xmm5,xmm2
    131 	punpcklbw xmm2,xmm0		; xmm2=(-1  0  1  2  3  4  5  6)
    132 	punpckhbw xmm5,xmm0		; xmm5=( 7  8  9 10 11 12 13 14)
    133 	movdqa    xmm6,xmm3
    134 	punpcklbw xmm3,xmm0		; xmm3=( 1  2  3  4  5  6  7  8)
    135 	punpckhbw xmm6,xmm0		; xmm6=( 9 10 11 12 13 14 15 16)
    136 
    137 	pmullw	xmm1,[rel PW_THREE]
    138 	pmullw	xmm4,[rel PW_THREE]
    139 	paddw	xmm2,[rel PW_ONE]
    140 	paddw	xmm5,[rel PW_ONE]
    141 	paddw	xmm3,[rel PW_TWO]
    142 	paddw	xmm6,[rel PW_TWO]
    143 
    144 	paddw	xmm2,xmm1
    145 	paddw	xmm5,xmm4
    146 	psrlw	xmm2,2			; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
    147 	psrlw	xmm5,2			; xmm5=OutHE=(16 18 20 22 24 26 28 30)
    148 	paddw	xmm3,xmm1
    149 	paddw	xmm6,xmm4
    150 	psrlw	xmm3,2			; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
    151 	psrlw	xmm6,2			; xmm6=OutHO=(17 19 21 23 25 27 29 31)
    152 
    153 	psllw	xmm3,BYTE_BIT
    154 	psllw	xmm6,BYTE_BIT
    155 	por	xmm2,xmm3		; xmm2=OutL=( 0  1  2 ... 13 14 15)
    156 	por	xmm5,xmm6		; xmm5=OutH=(16 17 18 ... 29 30 31)
    157 
    158 	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    159 	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
    160 
    161 	sub	rax, byte SIZEOF_XMMWORD
    162 	add	rsi, byte 1*SIZEOF_XMMWORD	; inptr
    163 	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
    164 	cmp	rax, byte SIZEOF_XMMWORD
    165 	ja	near .columnloop
    166 	test	eax,eax
    167 	jnz	near .columnloop_last
    168 
    169 	pop	rsi
    170 	pop	rdi
    171 	pop	rax
    172 
    173 	add	rsi, byte SIZEOF_JSAMPROW	; input_data
    174 	add	rdi, byte SIZEOF_JSAMPROW	; output_data
    175 	dec	rcx				; rowctr
    176 	jg	near .rowloop
    177 
    178 .return:
    179 	uncollect_args
    180 	pop	rbp
    181 	ret
    182 
    183 ; --------------------------------------------------------------------------
    184 ;
    185 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
    186 ; Again a triangle filter; see comments for h2v1 case, above.
    187 ;
    188 ; GLOBAL(void)
    189 ; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
    190 ;                                 JDIMENSION downsampled_width,
    191 ;                                 JSAMPARRAY input_data,
    192 ;                                 JSAMPARRAY * output_data_ptr);
    193 ;
    194 
    195 ; r10 = int max_v_samp_factor
    196 ; r11 = JDIMENSION downsampled_width
    197 ; r12 = JSAMPARRAY input_data
    198 ; r13 = JSAMPARRAY * output_data_ptr
    199 
    200 %define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
    201 %define WK_NUM		4
    202 
    203 	align	16
    204 	global	EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE
    205 
    206 EXTN(jsimd_h2v2_fancy_upsample_sse2):
    207 	push	rbp
    208 	mov	rax,rsp				; rax = original rbp
    209 	sub	rsp, byte 4
    210 	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
    211 	mov	[rsp],rax
    212 	mov	rbp,rsp				; rbp = aligned rbp
    213 	lea	rsp, [wk(0)]
    214 	collect_args
    215 	push	rbx
    216 
    217 	mov	rax, r11  ; colctr
    218 	test	rax,rax
    219 	jz	near .return
    220 
    221 	mov	rcx, r10	; rowctr
    222 	test	rcx,rcx
    223 	jz	near .return
    224 
    225 	mov	rsi, r12	; input_data
    226 	mov	rdi, r13
    227 	mov	rdi, JSAMPARRAY [rdi]			; output_data
    228 .rowloop:
    229 	push	rax					; colctr
    230 	push	rcx
    231 	push	rdi
    232 	push	rsi
    233 
    234 	mov	rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]	; inptr1(above)
    235 	mov	rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; inptr0
    236 	mov	rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; inptr1(below)
    237 	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]	; outptr0
    238 	mov	rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]	; outptr1
    239 
    240 	test	rax, SIZEOF_XMMWORD-1
    241 	jz	short .skip
    242 	push	rdx
    243 	mov	dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
    244 	mov	JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
    245 	mov	dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
    246 	mov	JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
    247 	mov	dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    248 	mov	JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
    249 	pop	rdx
    250 .skip:
    251 	; -- process the first column block
    252 
    253 	movdqa	xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]	; xmm0=row[ 0][0]
    254 	movdqa	xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]	; xmm1=row[-1][0]
    255 	movdqa	xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]	; xmm2=row[+1][0]
    256 
    257 	pxor      xmm3,xmm3		; xmm3=(all 0's)
    258 	movdqa    xmm4,xmm0
    259 	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    260 	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    261 	movdqa    xmm5,xmm1
    262 	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    263 	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    264 	movdqa    xmm6,xmm2
    265 	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    266 	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
    267 
    268 	pmullw	xmm0,[rel PW_THREE]
    269 	pmullw	xmm4,[rel PW_THREE]
    270 
    271 	pcmpeqb	xmm7,xmm7
    272 	psrldq	xmm7,(SIZEOF_XMMWORD-2)
    273 
    274 	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    275 	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    276 	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    277 	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
    278 
    279 	movdqa	XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1	; temporarily save
    280 	movdqa	XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5	; the intermediate data
    281 	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    282 	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
    283 
    284 	pand	xmm1,xmm7		; xmm1=( 0 -- -- -- -- -- -- --)
    285 	pand	xmm2,xmm7		; xmm2=( 0 -- -- -- -- -- -- --)
    286 
    287 	movdqa	XMMWORD [wk(0)], xmm1
    288 	movdqa	XMMWORD [wk(1)], xmm2
    289 
    290 	add	rax, byte SIZEOF_XMMWORD-1
    291 	and	rax, byte -SIZEOF_XMMWORD
    292 	cmp	rax, byte SIZEOF_XMMWORD
    293 	ja	short .columnloop
    294 
    295 .columnloop_last:
    296 	; -- process the last column block
    297 
    298 	pcmpeqb	xmm1,xmm1
    299 	pslldq	xmm1,(SIZEOF_XMMWORD-2)
    300 	movdqa	xmm2,xmm1
    301 
    302 	pand	xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    303 	pand	xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
    304 
    305 	movdqa	XMMWORD [wk(2)], xmm1	; xmm1=(-- -- -- -- -- -- -- 15)
    306 	movdqa	XMMWORD [wk(3)], xmm2	; xmm2=(-- -- -- -- -- -- -- 15)
    307 
    308 	jmp	near .upsample
    309 
    310 .columnloop:
    311 	; -- process the next column block
    312 
    313 	movdqa	xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]	; xmm0=row[ 0][1]
    314 	movdqa	xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]	; xmm1=row[-1][1]
    315 	movdqa	xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]	; xmm2=row[+1][1]
    316 
    317 	pxor      xmm3,xmm3		; xmm3=(all 0's)
    318 	movdqa    xmm4,xmm0
    319 	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    320 	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    321 	movdqa    xmm5,xmm1
    322 	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    323 	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    324 	movdqa    xmm6,xmm2
    325 	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    326 	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
    327 
    328 	pmullw	xmm0,[rel PW_THREE]
    329 	pmullw	xmm4,[rel PW_THREE]
    330 
    331 	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    332 	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    333 	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    334 	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
    335 
    336 	movdqa	XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1	; temporarily save
    337 	movdqa	XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5	; the intermediate data
    338 	movdqa	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    339 	movdqa	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
    340 
    341 	pslldq	xmm1,(SIZEOF_XMMWORD-2)	; xmm1=(-- -- -- -- -- -- --  0)
    342 	pslldq	xmm2,(SIZEOF_XMMWORD-2)	; xmm2=(-- -- -- -- -- -- --  0)
    343 
    344 	movdqa	XMMWORD [wk(2)], xmm1
    345 	movdqa	XMMWORD [wk(3)], xmm2
    346 
    347 .upsample:
    348 	; -- process the upper row
    349 
    350 	movdqa	xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    351 	movdqa	xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    352 
    353 	movdqa	xmm0,xmm7		; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
    354 	movdqa	xmm4,xmm3		; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
    355 	psrldq	xmm0,2			; xmm0=( 1  2  3  4  5  6  7 --)
    356 	pslldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(-- -- -- -- -- -- --  8)
    357 	movdqa	xmm5,xmm7
    358 	movdqa	xmm6,xmm3
    359 	psrldq	xmm5,(SIZEOF_XMMWORD-2)	; xmm5=( 7 -- -- -- -- -- -- --)
    360 	pslldq	xmm6,2			; xmm6=(--  8  9 10 11 12 13 14)
    361 
    362 	por	xmm0,xmm4		; xmm0=( 1  2  3  4  5  6  7  8)
    363 	por	xmm5,xmm6		; xmm5=( 7  8  9 10 11 12 13 14)
    364 
    365 	movdqa	xmm1,xmm7
    366 	movdqa	xmm2,xmm3
    367 	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
    368 	psrldq	xmm2,2			; xmm2=( 9 10 11 12 13 14 15 --)
    369 	movdqa	xmm4,xmm3
    370 	psrldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(15 -- -- -- -- -- -- --)
    371 
    372 	por	xmm1, XMMWORD [wk(0)]	; xmm1=(-1  0  1  2  3  4  5  6)
    373 	por	xmm2, XMMWORD [wk(2)]	; xmm2=( 9 10 11 12 13 14 15 16)
    374 
    375 	movdqa	XMMWORD [wk(0)], xmm4
    376 
    377 	pmullw	xmm7,[rel PW_THREE]
    378 	pmullw	xmm3,[rel PW_THREE]
    379 	paddw	xmm1,[rel PW_EIGHT]
    380 	paddw	xmm5,[rel PW_EIGHT]
    381 	paddw	xmm0,[rel PW_SEVEN]
    382 	paddw	xmm2,[rel PW_SEVEN]
    383 
    384 	paddw	xmm1,xmm7
    385 	paddw	xmm5,xmm3
    386 	psrlw	xmm1,4			; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
    387 	psrlw	xmm5,4			; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
    388 	paddw	xmm0,xmm7
    389 	paddw	xmm2,xmm3
    390 	psrlw	xmm0,4			; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
    391 	psrlw	xmm2,4			; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
    392 
    393 	psllw	xmm0,BYTE_BIT
    394 	psllw	xmm2,BYTE_BIT
    395 	por	xmm1,xmm0		; xmm1=Out0L=( 0  1  2 ... 13 14 15)
    396 	por	xmm5,xmm2		; xmm5=Out0H=(16 17 18 ... 29 30 31)
    397 
    398 	movdqa	XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
    399 	movdqa	XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
    400 
    401 	; -- process the lower row
    402 
    403 	movdqa	xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
    404 	movdqa	xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
    405 
    406 	movdqa	xmm7,xmm6		; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
    407 	movdqa	xmm3,xmm4		; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
    408 	psrldq	xmm7,2			; xmm7=( 1  2  3  4  5  6  7 --)
    409 	pslldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(-- -- -- -- -- -- --  8)
    410 	movdqa	xmm0,xmm6
    411 	movdqa	xmm2,xmm4
    412 	psrldq	xmm0,(SIZEOF_XMMWORD-2)	; xmm0=( 7 -- -- -- -- -- -- --)
    413 	pslldq	xmm2,2			; xmm2=(--  8  9 10 11 12 13 14)
    414 
    415 	por	xmm7,xmm3		; xmm7=( 1  2  3  4  5  6  7  8)
    416 	por	xmm0,xmm2		; xmm0=( 7  8  9 10 11 12 13 14)
    417 
    418 	movdqa	xmm1,xmm6
    419 	movdqa	xmm5,xmm4
    420 	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
    421 	psrldq	xmm5,2			; xmm5=( 9 10 11 12 13 14 15 --)
    422 	movdqa	xmm3,xmm4
    423 	psrldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(15 -- -- -- -- -- -- --)
    424 
    425 	por	xmm1, XMMWORD [wk(1)]	; xmm1=(-1  0  1  2  3  4  5  6)
    426 	por	xmm5, XMMWORD [wk(3)]	; xmm5=( 9 10 11 12 13 14 15 16)
    427 
    428 	movdqa	XMMWORD [wk(1)], xmm3
    429 
    430 	pmullw	xmm6,[rel PW_THREE]
    431 	pmullw	xmm4,[rel PW_THREE]
    432 	paddw	xmm1,[rel PW_EIGHT]
    433 	paddw	xmm0,[rel PW_EIGHT]
    434 	paddw	xmm7,[rel PW_SEVEN]
    435 	paddw	xmm5,[rel PW_SEVEN]
    436 
    437 	paddw	xmm1,xmm6
    438 	paddw	xmm0,xmm4
    439 	psrlw	xmm1,4			; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
    440 	psrlw	xmm0,4			; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
    441 	paddw	xmm7,xmm6
    442 	paddw	xmm5,xmm4
    443 	psrlw	xmm7,4			; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
    444 	psrlw	xmm5,4			; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
    445 
    446 	psllw	xmm7,BYTE_BIT
    447 	psllw	xmm5,BYTE_BIT
    448 	por	xmm1,xmm7		; xmm1=Out1L=( 0  1  2 ... 13 14 15)
    449 	por	xmm0,xmm5		; xmm0=Out1H=(16 17 18 ... 29 30 31)
    450 
    451 	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
    452 	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
    453 
    454 	sub	rax, byte SIZEOF_XMMWORD
    455 	add	rcx, byte 1*SIZEOF_XMMWORD	; inptr1(above)
    456 	add	rbx, byte 1*SIZEOF_XMMWORD	; inptr0
    457 	add	rsi, byte 1*SIZEOF_XMMWORD	; inptr1(below)
    458 	add	rdx, byte 2*SIZEOF_XMMWORD	; outptr0
    459 	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr1
    460 	cmp	rax, byte SIZEOF_XMMWORD
    461 	ja	near .columnloop
    462 	test	rax,rax
    463 	jnz	near .columnloop_last
    464 
    465 	pop	rsi
    466 	pop	rdi
    467 	pop	rcx
    468 	pop	rax
    469 
    470 	add	rsi, byte 1*SIZEOF_JSAMPROW	; input_data
    471 	add	rdi, byte 2*SIZEOF_JSAMPROW	; output_data
    472 	sub	rcx, byte 2			; rowctr
    473 	jg	near .rowloop
    474 
    475 .return:
    476 	pop	rbx
    477 	uncollect_args
    478 	mov	rsp,rbp		; rsp <- aligned rbp
    479 	pop	rsp		; rsp <- original rbp
    480 	pop	rbp
    481 	ret
    482 
    483 ; --------------------------------------------------------------------------
    484 ;
    485 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
    486 ; It's still a box filter.
    487 ;
    488 ; GLOBAL(void)
    489 ; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
    490 ;                           JDIMENSION output_width,
    491 ;                           JSAMPARRAY input_data,
    492 ;                           JSAMPARRAY * output_data_ptr);
    493 ;
    494 
    495 ; r10 = int max_v_samp_factor
    496 ; r11 = JDIMENSION output_width
    497 ; r12 = JSAMPARRAY input_data
    498 ; r13 = JSAMPARRAY * output_data_ptr
    499 
    500 	align	16
    501 	global	EXTN(jsimd_h2v1_upsample_sse2) PRIVATE
    502 
    503 EXTN(jsimd_h2v1_upsample_sse2):
    504 	push	rbp
    505 	mov	rax,rsp
    506 	mov	rbp,rsp
    507 	collect_args
    508 
    509 	mov	rdx, r11
    510 	add	rdx, byte (2*SIZEOF_XMMWORD)-1
    511 	and	rdx, byte -(2*SIZEOF_XMMWORD)
    512 	jz	near .return
    513 
    514 	mov	rcx, r10	; rowctr
    515 	test	rcx,rcx
    516 	jz	short .return
    517 
    518 	mov	rsi, r12 ; input_data
    519 	mov	rdi, r13
    520 	mov	rdi, JSAMPARRAY [rdi]			; output_data
    521 .rowloop:
    522 	push	rdi
    523 	push	rsi
    524 
    525 	mov	rsi, JSAMPROW [rsi]		; inptr
    526 	mov	rdi, JSAMPROW [rdi]		; outptr
    527 	mov	rax,rdx				; colctr
    528 .columnloop:
    529 
    530 	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    531 
    532 	movdqa    xmm1,xmm0
    533 	punpcklbw xmm0,xmm0
    534 	punpckhbw xmm1,xmm1
    535 
    536 	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    537 	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
    538 
    539 	sub	rax, byte 2*SIZEOF_XMMWORD
    540 	jz	short .nextrow
    541 
    542 	movdqa	xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    543 
    544 	movdqa    xmm3,xmm2
    545 	punpcklbw xmm2,xmm2
    546 	punpckhbw xmm3,xmm3
    547 
    548 	movdqa	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    549 	movdqa	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
    550 
    551 	sub	rax, byte 2*SIZEOF_XMMWORD
    552 	jz	short .nextrow
    553 
    554 	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr
    555 	add	rdi, byte 4*SIZEOF_XMMWORD	; outptr
    556 	jmp	short .columnloop
    557 
    558 .nextrow:
    559 	pop	rsi
    560 	pop	rdi
    561 
    562 	add	rsi, byte SIZEOF_JSAMPROW	; input_data
    563 	add	rdi, byte SIZEOF_JSAMPROW	; output_data
    564 	dec	rcx				; rowctr
    565 	jg	short .rowloop
    566 
    567 .return:
    568 	uncollect_args
    569 	pop	rbp
    570 	ret
    571 
    572 ; --------------------------------------------------------------------------
    573 ;
    574 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
    575 ; It's still a box filter.
    576 ;
    577 ; GLOBAL(void)
    578 ; jsimd_h2v2_upsample_sse2 (nt max_v_samp_factor,
    579 ;                           JDIMENSION output_width,
    580 ;                           JSAMPARRAY input_data,
    581 ;                           JSAMPARRAY * output_data_ptr);
    582 ;
    583 
    584 ; r10 = int max_v_samp_factor
    585 ; r11 = JDIMENSION output_width
    586 ; r12 = JSAMPARRAY input_data
    587 ; r13 = JSAMPARRAY * output_data_ptr
    588 
    589 	align	16
    590 	global	EXTN(jsimd_h2v2_upsample_sse2) PRIVATE
    591 
    592 EXTN(jsimd_h2v2_upsample_sse2):
    593 	push	rbp
    594 	mov	rax,rsp
    595 	mov	rbp,rsp
    596 	collect_args
    597 	push	rbx
    598 
    599 	mov	rdx, r11
    600 	add	rdx, byte (2*SIZEOF_XMMWORD)-1
    601 	and	rdx, byte -(2*SIZEOF_XMMWORD)
    602 	jz	near .return
    603 
    604 	mov	rcx, r10	; rowctr
    605 	test	rcx,rcx
    606 	jz	near .return
    607 
    608 	mov	rsi, r12	; input_data
    609 	mov	rdi, r13
    610 	mov	rdi, JSAMPARRAY [rdi]			; output_data
    611 .rowloop:
    612 	push	rdi
    613 	push	rsi
    614 
    615 	mov	rsi, JSAMPROW [rsi]			; inptr
    616 	mov	rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]	; outptr0
    617 	mov	rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]	; outptr1
    618 	mov	rax,rdx					; colctr
    619 .columnloop:
    620 
    621 	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    622 
    623 	movdqa    xmm1,xmm0
    624 	punpcklbw xmm0,xmm0
    625 	punpckhbw xmm1,xmm1
    626 
    627 	movdqa	XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
    628 	movdqa	XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
    629 	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    630 	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
    631 
    632 	sub	rax, byte 2*SIZEOF_XMMWORD
    633 	jz	short .nextrow
    634 
    635 	movdqa	xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    636 
    637 	movdqa    xmm3,xmm2
    638 	punpcklbw xmm2,xmm2
    639 	punpckhbw xmm3,xmm3
    640 
    641 	movdqa	XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
    642 	movdqa	XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
    643 	movdqa	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    644 	movdqa	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
    645 
    646 	sub	rax, byte 2*SIZEOF_XMMWORD
    647 	jz	short .nextrow
    648 
    649 	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr
    650 	add	rbx, byte 4*SIZEOF_XMMWORD	; outptr0
    651 	add	rdi, byte 4*SIZEOF_XMMWORD	; outptr1
    652 	jmp	short .columnloop
    653 
    654 .nextrow:
    655 	pop	rsi
    656 	pop	rdi
    657 
    658 	add	rsi, byte 1*SIZEOF_JSAMPROW	; input_data
    659 	add	rdi, byte 2*SIZEOF_JSAMPROW	; output_data
    660 	sub	rcx, byte 2			; rowctr
    661 	jg	near .rowloop
    662 
    663 .return:
    664 	pop	rbx
    665 	uncollect_args
    666 	pop	rbp
    667 	ret
    668 
    669 ; For some reason, the OS X linker does not honor the request to align the
    670 ; segment unless we do this.
    671 	align	16
    672