Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jcsammmx.asm - downsampling (MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ;
      6 ; Based on
      7 ; x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler),
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jsimdext.inc"
     20 
     21 ; --------------------------------------------------------------------------
     22 	SECTION	SEG_TEXT
     23 	BITS	32
     24 ;
     25 ; Downsample pixel values of a single component.
     26 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
     27 ; without smoothing.
     28 ;
     29 ; GLOBAL(void)
     30 ; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
     31 ;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
     32 ;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
     33 ;
     34 
     35 %define img_width(b)	(b)+8			; JDIMENSION image_width
     36 %define max_v_samp(b)	(b)+12		; int max_v_samp_factor
     37 %define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
     38 %define width_blks(b)	(b)+20		; JDIMENSION width_blocks
     39 %define input_data(b)	(b)+24		; JSAMPARRAY input_data
     40 %define output_data(b)	(b)+28	; JSAMPARRAY output_data
     41 
     42 	align	16
     43 	global	EXTN(jsimd_h2v1_downsample_mmx) PRIVATE
     44 
     45 EXTN(jsimd_h2v1_downsample_mmx):
     46 	push	ebp
     47 	mov	ebp,esp
     48 ;	push	ebx		; unused
     49 ;	push	ecx		; need not be preserved
     50 ;	push	edx		; need not be preserved
     51 	push	esi
     52 	push	edi
     53 
     54 	mov	ecx, JDIMENSION [width_blks(ebp)]
     55 	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
     56 	jz	near .return
     57 
     58 	mov	edx, JDIMENSION [img_width(ebp)]
     59 
     60 	; -- expand_right_edge
     61 
     62 	push	ecx
     63 	shl	ecx,1				; output_cols * 2
     64 	sub	ecx,edx
     65 	jle	short .expand_end
     66 
     67 	mov	eax, INT [max_v_samp(ebp)]
     68 	test	eax,eax
     69 	jle	short .expand_end
     70 
     71 	cld
     72 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
     73 	alignx	16,7
     74 .expandloop:
     75 	push	eax
     76 	push	ecx
     77 
     78 	mov	edi, JSAMPROW [esi]
     79 	add	edi,edx
     80 	mov	al, JSAMPLE [edi-1]
     81 
     82 	rep stosb
     83 
     84 	pop	ecx
     85 	pop	eax
     86 
     87 	add	esi, byte SIZEOF_JSAMPROW
     88 	dec	eax
     89 	jg	short .expandloop
     90 
     91 .expand_end:
     92 	pop	ecx				; output_cols
     93 
     94 	; -- h2v1_downsample
     95 
     96 	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
     97 	test	eax,eax
     98 	jle	near .return
     99 
    100 	mov       edx, 0x00010000	; bias pattern
    101 	movd      mm7,edx
    102 	pcmpeqw   mm6,mm6
    103 	punpckldq mm7,mm7		; mm7={0, 1, 0, 1}
    104 	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
    105 
    106 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
    107 	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
    108 	alignx	16,7
    109 .rowloop:
    110 	push	ecx
    111 	push	edi
    112 	push	esi
    113 
    114 	mov	esi, JSAMPROW [esi]		; inptr
    115 	mov	edi, JSAMPROW [edi]		; outptr
    116 	alignx	16,7
    117 .columnloop:
    118 
    119 	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
    120 	movq	mm1, MMWORD [esi+1*SIZEOF_MMWORD]
    121 	movq	mm2,mm0
    122 	movq	mm3,mm1
    123 
    124 	pand	mm0,mm6
    125 	psrlw	mm2,BYTE_BIT
    126 	pand	mm1,mm6
    127 	psrlw	mm3,BYTE_BIT
    128 
    129 	paddw	mm0,mm2
    130 	paddw	mm1,mm3
    131 	paddw	mm0,mm7
    132 	paddw	mm1,mm7
    133 	psrlw	mm0,1
    134 	psrlw	mm1,1
    135 
    136 	packuswb mm0,mm1
    137 
    138 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
    139 
    140 	add	esi, byte 2*SIZEOF_MMWORD	; inptr
    141 	add	edi, byte 1*SIZEOF_MMWORD	; outptr
    142 	sub	ecx, byte SIZEOF_MMWORD		; outcol
    143 	jnz	short .columnloop
    144 
    145 	pop	esi
    146 	pop	edi
    147 	pop	ecx
    148 
    149 	add	esi, byte SIZEOF_JSAMPROW	; input_data
    150 	add	edi, byte SIZEOF_JSAMPROW	; output_data
    151 	dec	eax				; rowctr
    152 	jg	short .rowloop
    153 
    154 	emms		; empty MMX state
    155 
    156 .return:
    157 	pop	edi
    158 	pop	esi
    159 ;	pop	edx		; need not be preserved
    160 ;	pop	ecx		; need not be preserved
    161 ;	pop	ebx		; unused
    162 	pop	ebp
    163 	ret
    164 
    165 ; --------------------------------------------------------------------------
    166 ;
    167 ; Downsample pixel values of a single component.
    168 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
    169 ; without smoothing.
    170 ;
    171 ; GLOBAL(void)
    172 ; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
    173 ;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
    174 ;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
    175 ;
    176 
    177 %define img_width(b)	(b)+8			; JDIMENSION image_width
    178 %define max_v_samp(b)	(b)+12		; int max_v_samp_factor
    179 %define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
    180 %define width_blks(b)	(b)+20		; JDIMENSION width_blocks
    181 %define input_data(b)	(b)+24		; JSAMPARRAY input_data
    182 %define output_data(b)	(b)+28	; JSAMPARRAY output_data
    183 
    184 	align	16
    185 	global	EXTN(jsimd_h2v2_downsample_mmx) PRIVATE
    186 
    187 EXTN(jsimd_h2v2_downsample_mmx):
    188 	push	ebp
    189 	mov	ebp,esp
    190 ;	push	ebx		; unused
    191 ;	push	ecx		; need not be preserved
    192 ;	push	edx		; need not be preserved
    193 	push	esi
    194 	push	edi
    195 
    196 	mov	ecx, JDIMENSION [width_blks(ebp)]
    197 	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
    198 	jz	near .return
    199 
    200 	mov	edx, JDIMENSION [img_width(ebp)]
    201 
    202 	; -- expand_right_edge
    203 
    204 	push	ecx
    205 	shl	ecx,1				; output_cols * 2
    206 	sub	ecx,edx
    207 	jle	short .expand_end
    208 
    209 	mov	eax, INT [max_v_samp(ebp)]
    210 	test	eax,eax
    211 	jle	short .expand_end
    212 
    213 	cld
    214 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
    215 	alignx	16,7
    216 .expandloop:
    217 	push	eax
    218 	push	ecx
    219 
    220 	mov	edi, JSAMPROW [esi]
    221 	add	edi,edx
    222 	mov	al, JSAMPLE [edi-1]
    223 
    224 	rep stosb
    225 
    226 	pop	ecx
    227 	pop	eax
    228 
    229 	add	esi, byte SIZEOF_JSAMPROW
    230 	dec	eax
    231 	jg	short .expandloop
    232 
    233 .expand_end:
    234 	pop	ecx				; output_cols
    235 
    236 	; -- h2v2_downsample
    237 
    238 	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
    239 	test	eax,eax
    240 	jle	near .return
    241 
    242 	mov       edx, 0x00020001	; bias pattern
    243 	movd      mm7,edx
    244 	pcmpeqw   mm6,mm6
    245 	punpckldq mm7,mm7		; mm7={1, 2, 1, 2}
    246 	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
    247 
    248 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
    249 	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
    250 	alignx	16,7
    251 .rowloop:
    252 	push	ecx
    253 	push	edi
    254 	push	esi
    255 
    256 	mov	edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
    257 	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1
    258 	mov	edi, JSAMPROW [edi]			; outptr
    259 	alignx	16,7
    260 .columnloop:
    261 
    262 	movq	mm0, MMWORD [edx+0*SIZEOF_MMWORD]
    263 	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]
    264 	movq	mm2, MMWORD [edx+1*SIZEOF_MMWORD]
    265 	movq	mm3, MMWORD [esi+1*SIZEOF_MMWORD]
    266 
    267 	movq	mm4,mm0
    268 	movq	mm5,mm1
    269 	pand	mm0,mm6
    270 	psrlw	mm4,BYTE_BIT
    271 	pand	mm1,mm6
    272 	psrlw	mm5,BYTE_BIT
    273 	paddw	mm0,mm4
    274 	paddw	mm1,mm5
    275 
    276 	movq	mm4,mm2
    277 	movq	mm5,mm3
    278 	pand	mm2,mm6
    279 	psrlw	mm4,BYTE_BIT
    280 	pand	mm3,mm6
    281 	psrlw	mm5,BYTE_BIT
    282 	paddw	mm2,mm4
    283 	paddw	mm3,mm5
    284 
    285 	paddw	mm0,mm1
    286 	paddw	mm2,mm3
    287 	paddw	mm0,mm7
    288 	paddw	mm2,mm7
    289 	psrlw	mm0,2
    290 	psrlw	mm2,2
    291 
    292 	packuswb mm0,mm2
    293 
    294 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
    295 
    296 	add	edx, byte 2*SIZEOF_MMWORD	; inptr0
    297 	add	esi, byte 2*SIZEOF_MMWORD	; inptr1
    298 	add	edi, byte 1*SIZEOF_MMWORD	; outptr
    299 	sub	ecx, byte SIZEOF_MMWORD		; outcol
    300 	jnz	near .columnloop
    301 
    302 	pop	esi
    303 	pop	edi
    304 	pop	ecx
    305 
    306 	add	esi, byte 2*SIZEOF_JSAMPROW	; input_data
    307 	add	edi, byte 1*SIZEOF_JSAMPROW	; output_data
    308 	dec	eax				; rowctr
    309 	jg	near .rowloop
    310 
    311 	emms		; empty MMX state
    312 
    313 .return:
    314 	pop	edi
    315 	pop	esi
    316 ;	pop	edx		; need not be preserved
    317 ;	pop	ecx		; need not be preserved
    318 ;	pop	ebx		; unused
    319 	pop	ebp
    320 	ret
    321 
    322 ; For some reason, the OS X linker does not honor the request to align the
    323 ; segment unless we do this.
    324 	align	16
    325