Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jcsamss2.asm - downsampling (SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ;
      6 ; Based on
      7 ; x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler),
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jsimdext.inc"
     20 
     21 ; --------------------------------------------------------------------------
     22 	SECTION	SEG_TEXT
     23 	BITS	32
     24 ;
     25 ; Downsample pixel values of a single component.
     26 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
     27 ; without smoothing.
     28 ;
     29 ; GLOBAL(void)
     30 ; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
     31 ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
     32 ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
     33 ;
     34 
     35 %define img_width(b)	(b)+8			; JDIMENSION image_width
     36 %define max_v_samp(b)	(b)+12		; int max_v_samp_factor
     37 %define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
     38 %define width_blks(b)	(b)+20		; JDIMENSION width_blocks
     39 %define input_data(b)	(b)+24		; JSAMPARRAY input_data
     40 %define output_data(b)	(b)+28		; JSAMPARRAY output_data
     41 
     42 	align	16
     43 	global	EXTN(jsimd_h2v1_downsample_sse2) PRIVATE
     44 
     45 EXTN(jsimd_h2v1_downsample_sse2):
     46 	push	ebp
     47 	mov	ebp,esp
     48 ;	push	ebx		; unused
     49 ;	push	ecx		; need not be preserved
     50 ;	push	edx		; need not be preserved
     51 	push	esi
     52 	push	edi
     53 
     54 	mov	ecx, JDIMENSION [width_blks(ebp)]
     55 	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
     56 	jz	near .return
     57 
     58 	mov	edx, JDIMENSION [img_width(ebp)]
     59 
     60 	; -- expand_right_edge
     61 
     62 	push	ecx
     63 	shl	ecx,1				; output_cols * 2
     64 	sub	ecx,edx
     65 	jle	short .expand_end
     66 
     67 	mov	eax, INT [max_v_samp(ebp)]
     68 	test	eax,eax
     69 	jle	short .expand_end
     70 
     71 	cld
     72 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
     73 	alignx	16,7
     74 .expandloop:
     75 	push	eax
     76 	push	ecx
     77 
     78 	mov	edi, JSAMPROW [esi]
     79 	add	edi,edx
     80 	mov	al, JSAMPLE [edi-1]
     81 
     82 	rep stosb
     83 
     84 	pop	ecx
     85 	pop	eax
     86 
     87 	add	esi, byte SIZEOF_JSAMPROW
     88 	dec	eax
     89 	jg	short .expandloop
     90 
     91 .expand_end:
     92 	pop	ecx				; output_cols
     93 
     94 	; -- h2v1_downsample
     95 
     96 	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
     97 	test	eax,eax
     98 	jle	near .return
     99 
    100 	mov	edx, 0x00010000		; bias pattern
    101 	movd	xmm7,edx
    102 	pcmpeqw	xmm6,xmm6
    103 	pshufd	xmm7,xmm7,0x00		; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    104 	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
    105 
    106 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
    107 	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
    108 	alignx	16,7
    109 .rowloop:
    110 	push	ecx
    111 	push	edi
    112 	push	esi
    113 
    114 	mov	esi, JSAMPROW [esi]		; inptr
    115 	mov	edi, JSAMPROW [edi]		; outptr
    116 
    117 	cmp	ecx, byte SIZEOF_XMMWORD
    118 	jae	short .columnloop
    119 	alignx	16,7
    120 
    121 .columnloop_r8:
    122 	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    123 	pxor	xmm1,xmm1
    124 	mov	ecx, SIZEOF_XMMWORD
    125 	jmp	short .downsample
    126 	alignx	16,7
    127 
    128 .columnloop:
    129 	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    130 	movdqa	xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
    131 
    132 .downsample:
    133 	movdqa	xmm2,xmm0
    134 	movdqa	xmm3,xmm1
    135 
    136 	pand	xmm0,xmm6
    137 	psrlw	xmm2,BYTE_BIT
    138 	pand	xmm1,xmm6
    139 	psrlw	xmm3,BYTE_BIT
    140 
    141 	paddw	xmm0,xmm2
    142 	paddw	xmm1,xmm3
    143 	paddw	xmm0,xmm7
    144 	paddw	xmm1,xmm7
    145 	psrlw	xmm0,1
    146 	psrlw	xmm1,1
    147 
    148 	packuswb xmm0,xmm1
    149 
    150 	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    151 
    152 	sub	ecx, byte SIZEOF_XMMWORD	; outcol
    153 	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
    154 	add	edi, byte 1*SIZEOF_XMMWORD	; outptr
    155 	cmp	ecx, byte SIZEOF_XMMWORD
    156 	jae	short .columnloop
    157 	test	ecx,ecx
    158 	jnz	short .columnloop_r8
    159 
    160 	pop	esi
    161 	pop	edi
    162 	pop	ecx
    163 
    164 	add	esi, byte SIZEOF_JSAMPROW	; input_data
    165 	add	edi, byte SIZEOF_JSAMPROW	; output_data
    166 	dec	eax				; rowctr
    167 	jg	near .rowloop
    168 
    169 .return:
    170 	pop	edi
    171 	pop	esi
    172 ;	pop	edx		; need not be preserved
    173 ;	pop	ecx		; need not be preserved
    174 ;	pop	ebx		; unused
    175 	pop	ebp
    176 	ret
    177 
    178 ; --------------------------------------------------------------------------
    179 ;
    180 ; Downsample pixel values of a single component.
    181 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
    182 ; without smoothing.
    183 ;
    184 ; GLOBAL(void)
    185 ; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
    186 ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
    187 ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
    188 ;
    189 
    190 %define img_width(b)	(b)+8			; JDIMENSION image_width
    191 %define max_v_samp(b)	(b)+12		; int max_v_samp_factor
    192 %define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
    193 %define width_blks(b)	(b)+20		; JDIMENSION width_blocks
    194 %define input_data(b)	(b)+24		; JSAMPARRAY input_data
    195 %define output_data(b)	(b)+28	; JSAMPARRAY output_data
    196 
    197 	align	16
    198 	global	EXTN(jsimd_h2v2_downsample_sse2) PRIVATE
    199 
    200 EXTN(jsimd_h2v2_downsample_sse2):
    201 	push	ebp
    202 	mov	ebp,esp
    203 ;	push	ebx		; unused
    204 ;	push	ecx		; need not be preserved
    205 ;	push	edx		; need not be preserved
    206 	push	esi
    207 	push	edi
    208 
    209 	mov	ecx, JDIMENSION [width_blks(ebp)]
    210 	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
    211 	jz	near .return
    212 
    213 	mov	edx, JDIMENSION [img_width(ebp)]
    214 
    215 	; -- expand_right_edge
    216 
    217 	push	ecx
    218 	shl	ecx,1				; output_cols * 2
    219 	sub	ecx,edx
    220 	jle	short .expand_end
    221 
    222 	mov	eax, INT [max_v_samp(ebp)]
    223 	test	eax,eax
    224 	jle	short .expand_end
    225 
    226 	cld
    227 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
    228 	alignx	16,7
    229 .expandloop:
    230 	push	eax
    231 	push	ecx
    232 
    233 	mov	edi, JSAMPROW [esi]
    234 	add	edi,edx
    235 	mov	al, JSAMPLE [edi-1]
    236 
    237 	rep stosb
    238 
    239 	pop	ecx
    240 	pop	eax
    241 
    242 	add	esi, byte SIZEOF_JSAMPROW
    243 	dec	eax
    244 	jg	short .expandloop
    245 
    246 .expand_end:
    247 	pop	ecx				; output_cols
    248 
    249 	; -- h2v2_downsample
    250 
    251 	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
    252 	test	eax,eax
    253 	jle	near .return
    254 
    255 	mov	edx, 0x00020001		; bias pattern
    256 	movd	xmm7,edx
    257 	pcmpeqw	xmm6,xmm6
    258 	pshufd	xmm7,xmm7,0x00		; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    259 	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
    260 
    261 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
    262 	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
    263 	alignx	16,7
    264 .rowloop:
    265 	push	ecx
    266 	push	edi
    267 	push	esi
    268 
    269 	mov	edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
    270 	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1
    271 	mov	edi, JSAMPROW [edi]			; outptr
    272 
    273 	cmp	ecx, byte SIZEOF_XMMWORD
    274 	jae	short .columnloop
    275 	alignx	16,7
    276 
    277 .columnloop_r8:
    278 	movdqa	xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
    279 	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    280 	pxor	xmm2,xmm2
    281 	pxor	xmm3,xmm3
    282 	mov	ecx, SIZEOF_XMMWORD
    283 	jmp	short .downsample
    284 	alignx	16,7
    285 
    286 .columnloop:
    287 	movdqa	xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
    288 	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    289 	movdqa	xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
    290 	movdqa	xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
    291 
    292 .downsample:
    293 	movdqa	xmm4,xmm0
    294 	movdqa	xmm5,xmm1
    295 	pand	xmm0,xmm6
    296 	psrlw	xmm4,BYTE_BIT
    297 	pand	xmm1,xmm6
    298 	psrlw	xmm5,BYTE_BIT
    299 	paddw	xmm0,xmm4
    300 	paddw	xmm1,xmm5
    301 
    302 	movdqa	xmm4,xmm2
    303 	movdqa	xmm5,xmm3
    304 	pand	xmm2,xmm6
    305 	psrlw	xmm4,BYTE_BIT
    306 	pand	xmm3,xmm6
    307 	psrlw	xmm5,BYTE_BIT
    308 	paddw	xmm2,xmm4
    309 	paddw	xmm3,xmm5
    310 
    311 	paddw	xmm0,xmm1
    312 	paddw	xmm2,xmm3
    313 	paddw	xmm0,xmm7
    314 	paddw	xmm2,xmm7
    315 	psrlw	xmm0,2
    316 	psrlw	xmm2,2
    317 
    318 	packuswb xmm0,xmm2
    319 
    320 	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    321 
    322 	sub	ecx, byte SIZEOF_XMMWORD	; outcol
    323 	add	edx, byte 2*SIZEOF_XMMWORD	; inptr0
    324 	add	esi, byte 2*SIZEOF_XMMWORD	; inptr1
    325 	add	edi, byte 1*SIZEOF_XMMWORD	; outptr
    326 	cmp	ecx, byte SIZEOF_XMMWORD
    327 	jae	near .columnloop
    328 	test	ecx,ecx
    329 	jnz	near .columnloop_r8
    330 
    331 	pop	esi
    332 	pop	edi
    333 	pop	ecx
    334 
    335 	add	esi, byte 2*SIZEOF_JSAMPROW	; input_data
    336 	add	edi, byte 1*SIZEOF_JSAMPROW	; output_data
    337 	dec	eax				; rowctr
    338 	jg	near .rowloop
    339 
    340 .return:
    341 	pop	edi
    342 	pop	esi
    343 ;	pop	edx		; need not be preserved
    344 ;	pop	ecx		; need not be preserved
    345 ;	pop	ebx		; unused
    346 	pop	ebp
    347 	ret
    348 
    349 ; For some reason, the OS X linker does not honor the request to align the
    350 ; segment unless we do this.
    351 	align	16
    352