Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jcsamss2-64.asm - downsampling (64-bit SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman (a] cendio.se> for Cendio AB
      5 ; Copyright 2009 D. R. Commander
      6 ;
      7 ; Based on
      8 ; x86 SIMD extension for IJG JPEG library
      9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     11 ;
     12 ; This file should be assembled with NASM (Netwide Assembler),
     13 ; can *not* be assembled with Microsoft's MASM or any compatible
     14 ; assembler (including Borland's Turbo Assembler).
     15 ; NASM is available from http://nasm.sourceforge.net/ or
     16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     17 ;
     18 ; [TAB8]
     19 
     20 %include "jsimdext.inc"
     21 
     22 ; --------------------------------------------------------------------------
     23 	SECTION	SEG_TEXT
     24 	BITS	64
     25 ;
     26 ; Downsample pixel values of a single component.
     27 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
     28 ; without smoothing.
     29 ;
     30 ; GLOBAL(void)
     31 ; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
     32 ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
     33 ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
     34 ;
     35 
     36 ; r10 = JDIMENSION image_width
     37 ; r11 = int max_v_samp_factor
     38 ; r12 = JDIMENSION v_samp_factor
     39 ; r13 = JDIMENSION width_blocks
     40 ; r14 = JSAMPARRAY input_data
     41 ; r15 = JSAMPARRAY output_data
     42 
     43 	align	16
     44 	global	EXTN(jsimd_h2v1_downsample_sse2) PRIVATE
     45 
     46 EXTN(jsimd_h2v1_downsample_sse2):
     47 	push	rbp
     48 	mov	rax,rsp
     49 	mov	rbp,rsp
     50 	collect_args
     51 
     52 	mov rcx, r13
     53 	shl	rcx,3			; imul rcx,DCTSIZE (rcx = output_cols)
     54 	jz	near .return
     55 
     56 	mov rdx, r10
     57 
     58 	; -- expand_right_edge
     59 
     60 	push	rcx
     61 	shl	rcx,1				; output_cols * 2
     62 	sub	rcx,rdx
     63 	jle	short .expand_end
     64 
     65 	mov	rax, r11
     66 	test	rax,rax
     67 	jle	short .expand_end
     68 
     69 	cld
     70 	mov	rsi, r14	; input_data
     71 .expandloop:
     72 	push	rax
     73 	push	rcx
     74 
     75 	mov	rdi, JSAMPROW [rsi]
     76 	add	rdi,rdx
     77 	mov	al, JSAMPLE [rdi-1]
     78 
     79 	rep stosb
     80 
     81 	pop	rcx
     82 	pop	rax
     83 
     84 	add	rsi, byte SIZEOF_JSAMPROW
     85 	dec	rax
     86 	jg	short .expandloop
     87 
     88 .expand_end:
     89 	pop	rcx				; output_cols
     90 
     91 	; -- h2v1_downsample
     92 
     93 	mov	rax, r12	; rowctr
     94 	test	eax,eax
     95 	jle	near .return
     96 
     97 	mov	rdx, 0x00010000		; bias pattern
     98 	movd	xmm7,edx
     99 	pcmpeqw	xmm6,xmm6
    100 	pshufd	xmm7,xmm7,0x00		; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    101 	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
    102 
    103 	mov	rsi, r14	; input_data
    104 	mov	rdi, r15	; output_data
    105 .rowloop:
    106 	push	rcx
    107 	push	rdi
    108 	push	rsi
    109 
    110 	mov	rsi, JSAMPROW [rsi]		; inptr
    111 	mov rdi, JSAMPROW [rdi]		; outptr
    112 
    113 	cmp	rcx, byte SIZEOF_XMMWORD
    114 	jae	short .columnloop
    115 
    116 .columnloop_r8:
    117 	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    118 	pxor	xmm1,xmm1
    119 	mov	rcx, SIZEOF_XMMWORD
    120 	jmp	short .downsample
    121 
    122 .columnloop:
    123 	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    124 	movdqa	xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    125 
    126 .downsample:
    127 	movdqa	xmm2,xmm0
    128 	movdqa	xmm3,xmm1
    129 
    130 	pand	xmm0,xmm6
    131 	psrlw	xmm2,BYTE_BIT
    132 	pand	xmm1,xmm6
    133 	psrlw	xmm3,BYTE_BIT
    134 
    135 	paddw	xmm0,xmm2
    136 	paddw	xmm1,xmm3
    137 	paddw	xmm0,xmm7
    138 	paddw	xmm1,xmm7
    139 	psrlw	xmm0,1
    140 	psrlw	xmm1,1
    141 
    142 	packuswb xmm0,xmm1
    143 
    144 	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    145 
    146 	sub	rcx, byte SIZEOF_XMMWORD	; outcol
    147 	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr
    148 	add	rdi, byte 1*SIZEOF_XMMWORD	; outptr
    149 	cmp	rcx, byte SIZEOF_XMMWORD
    150 	jae	short .columnloop
    151 	test	rcx,rcx
    152 	jnz	short .columnloop_r8
    153 
    154 	pop	rsi
    155 	pop	rdi
    156 	pop	rcx
    157 
    158 	add	rsi, byte SIZEOF_JSAMPROW	; input_data
    159 	add	rdi, byte SIZEOF_JSAMPROW	; output_data
    160 	dec	rax				; rowctr
    161 	jg	near .rowloop
    162 
    163 .return:
    164 	uncollect_args
    165 	pop	rbp
    166 	ret
    167 
    168 ; --------------------------------------------------------------------------
    169 ;
    170 ; Downsample pixel values of a single component.
    171 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
    172 ; without smoothing.
    173 ;
    174 ; GLOBAL(void)
    175 ; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
    176 ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
    177 ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
    178 ;
    179 
    180 ; r10 = JDIMENSION image_width
    181 ; r11 = int max_v_samp_factor
    182 ; r12 = JDIMENSION v_samp_factor
    183 ; r13 = JDIMENSION width_blocks
    184 ; r14 = JSAMPARRAY input_data
    185 ; r15 = JSAMPARRAY output_data
    186 
    187 	align	16
    188 	global	EXTN(jsimd_h2v2_downsample_sse2) PRIVATE
    189 
    190 EXTN(jsimd_h2v2_downsample_sse2):
    191 	push	rbp
    192 	mov	rax,rsp
    193 	mov	rbp,rsp
    194 	collect_args
    195 
    196 	mov	rcx, r13
    197 	shl	rcx,3			; imul rcx,DCTSIZE (rcx = output_cols)
    198 	jz	near .return
    199 
    200 	mov	rdx, r10
    201 
    202 	; -- expand_right_edge
    203 
    204 	push	rcx
    205 	shl	rcx,1				; output_cols * 2
    206 	sub	rcx,rdx
    207 	jle	short .expand_end
    208 
    209 	mov	rax, r11
    210 	test	rax,rax
    211 	jle	short .expand_end
    212 
    213 	cld
    214 	mov	rsi, r14	; input_data
    215 .expandloop:
    216 	push	rax
    217 	push	rcx
    218 
    219 	mov	rdi, JSAMPROW [rsi]
    220 	add	rdi,rdx
    221 	mov	al, JSAMPLE [rdi-1]
    222 
    223 	rep stosb
    224 
    225 	pop	rcx
    226 	pop	rax
    227 
    228 	add	rsi, byte SIZEOF_JSAMPROW
    229 	dec	rax
    230 	jg	short .expandloop
    231 
    232 .expand_end:
    233 	pop	rcx				; output_cols
    234 
    235 	; -- h2v2_downsample
    236 
    237 	mov	rax, r12	; rowctr
    238 	test	rax,rax
    239 	jle	near .return
    240 
    241 	mov	rdx, 0x00020001		; bias pattern
    242 	movd	xmm7,edx
    243 	pcmpeqw	xmm6,xmm6
    244 	pshufd	xmm7,xmm7,0x00		; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    245 	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
    246 
    247 	mov	rsi, r14	; input_data
    248 	mov	rdi, r15	; output_data
    249 .rowloop:
    250 	push	rcx
    251 	push	rdi
    252 	push	rsi
    253 
    254 	mov	rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; inptr0
    255 	mov	rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; inptr1
    256 	mov	rdi, JSAMPROW [rdi]			; outptr
    257 
    258 	cmp	rcx, byte SIZEOF_XMMWORD
    259 	jae	short .columnloop
    260 
    261 .columnloop_r8:
    262 	movdqa	xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    263 	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    264 	pxor	xmm2,xmm2
    265 	pxor	xmm3,xmm3
    266 	mov	rcx, SIZEOF_XMMWORD
    267 	jmp	short .downsample
    268 
    269 .columnloop:
    270 	movdqa	xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    271 	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    272 	movdqa	xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    273 	movdqa	xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    274 
    275 .downsample:
    276 	movdqa	xmm4,xmm0
    277 	movdqa	xmm5,xmm1
    278 	pand	xmm0,xmm6
    279 	psrlw	xmm4,BYTE_BIT
    280 	pand	xmm1,xmm6
    281 	psrlw	xmm5,BYTE_BIT
    282 	paddw	xmm0,xmm4
    283 	paddw	xmm1,xmm5
    284 
    285 	movdqa	xmm4,xmm2
    286 	movdqa	xmm5,xmm3
    287 	pand	xmm2,xmm6
    288 	psrlw	xmm4,BYTE_BIT
    289 	pand	xmm3,xmm6
    290 	psrlw	xmm5,BYTE_BIT
    291 	paddw	xmm2,xmm4
    292 	paddw	xmm3,xmm5
    293 
    294 	paddw	xmm0,xmm1
    295 	paddw	xmm2,xmm3
    296 	paddw	xmm0,xmm7
    297 	paddw	xmm2,xmm7
    298 	psrlw	xmm0,2
    299 	psrlw	xmm2,2
    300 
    301 	packuswb xmm0,xmm2
    302 
    303 	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    304 
    305 	sub	rcx, byte SIZEOF_XMMWORD	; outcol
    306 	add	rdx, byte 2*SIZEOF_XMMWORD	; inptr0
    307 	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr1
    308 	add	rdi, byte 1*SIZEOF_XMMWORD	; outptr
    309 	cmp	rcx, byte SIZEOF_XMMWORD
    310 	jae	near .columnloop
    311 	test	rcx,rcx
    312 	jnz	near .columnloop_r8
    313 
    314 	pop	rsi
    315 	pop	rdi
    316 	pop	rcx
    317 
    318 	add	rsi, byte 2*SIZEOF_JSAMPROW	; input_data
    319 	add	rdi, byte 1*SIZEOF_JSAMPROW	; output_data
    320 	dec	rax				; rowctr
    321 	jg	near .rowloop
    322 
    323 .return:
    324 	uncollect_args
    325 	pop	rbp
    326 	ret
    327 
    328 ; For some reason, the OS X linker does not honor the request to align the
    329 ; segment unless we do this.
    330 	align	16
    331