;
; jcsample.asm - downsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman (a] cendio.se> for Cendio AB
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Arguments are read from the stack through ebp (offsets defined below).
; esi and edi are saved on entry and restored on exit; eax, ecx, edx and
; mm0-mm3, mm6, mm7 are overwritten.
;
; The routine works in two phases:
;   1. Edge padding: when 2*output_cols exceeds image_width, the last
;      sample of each of the max_v_samp_factor input rows is replicated
;      to the right until the row is 2*output_cols samples long.
;   2. Downsampling: every horizontal pair of input samples (a, b) becomes
;      one output sample (a + b + bias) >> 1, where bias alternates
;      0, 1, 0, 1, ... across the output columns.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v1_downsample_mmx)

EXTN(jsimd_h2v1_downsample_mmx):
        push    ebp
        mov     ebp, esp
        ; ebx is never used; ecx and edx need not be preserved.
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = width_blocks * DCTSIZE
                                        ;     = output_cols
        jz      near .finish            ; nothing to do for zero columns

        mov     edx, JDIMENSION [img_width(ebp)]

        ; ---- phase 1: pad the right edge of every input row ----

        push    ecx                     ; keep output_cols for phase 2
        shl     ecx, 1                  ; ecx = output_cols * 2
        sub     ecx, edx                ; ecx = samples to append per row
        jle     short .pad_done         ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]
        test    eax, eax                ; eax = rows left to pad
        jle     short .pad_done

        cld                             ; stosb must walk upwards
        mov     esi, JSAMPARRAY [input_data(ebp)]
        alignx  16, 7
.pad_row:
        push    eax                     ; al and ecx are consumed below
        push    ecx

        mov     edi, JSAMPROW [esi]     ; edi = start of this row
        add     edi, edx                ; edi = first position past the image
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of the row

        rep stosb                       ; replicate it ecx times

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                     ; ecx = output_cols

        ; ---- phase 2: 2:1 horizontal downsampling ----

        mov     eax, JDIMENSION [v_samp(ebp)]
        test    eax, eax                ; eax = output rows left
        jle     near .finish

        pcmpeqw mm6, mm6
        psrlw   mm6, BYTE_BIT           ; mm6 = {0xFF 0x00 0xFF 0x00 ..}
                                        ; (selects the even-indexed bytes)
        mov     edx, 0x00010000         ; rounding bias, two words: 0 then 1
        movd    mm7, edx
        punpckldq mm7, mm7              ; mm7 = {0, 1, 0, 1}

        mov     esi, JSAMPARRAY [input_data(ebp)]
        mov     edi, JSAMPARRAY [output_data(ebp)]
        alignx  16, 7
.next_row:
        push    ecx                     ; column counter is consumed per row
        push    edi                     ; row-pointer cursors are reused as
        push    esi                     ; sample pointers inside the row

        mov     esi, JSAMPROW [esi]     ; esi = inptr
        mov     edi, JSAMPROW [edi]     ; edi = outptr
        alignx  16, 7
.next_chunk:
        ; Each pass reads 16 input samples and writes 8 output samples.

        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm1, MMWORD [esi+1*SIZEOF_MMWORD]
        movq    mm2, mm0
        movq    mm3, mm1

        pand    mm0, mm6                ; mm0/mm1 = even samples as words
        psrlw   mm2, BYTE_BIT           ; mm2/mm3 = odd samples as words
        pand    mm1, mm6
        psrlw   mm3, BYTE_BIT

        paddw   mm0, mm2                ; pair sums
        paddw   mm1, mm3
        paddw   mm0, mm7                ; + alternating bias
        paddw   mm1, mm7
        psrlw   mm0, 1                  ; / 2
        psrlw   mm1, 1

        packuswb mm0, mm1               ; 8 words -> 8 bytes

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
        add     edi, byte 1*SIZEOF_MMWORD       ; outptr
        sub     ecx, byte SIZEOF_MMWORD         ; output columns left
        jnz     short .next_chunk

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte SIZEOF_JSAMPROW       ; next input row pointer
        add     edi, byte SIZEOF_JSAMPROW       ; next output row pointer
        dec     eax                             ; output rows left
        jg      short .next_row

        emms                            ; empty MMX state

.finish:
        pop     edi
        pop     esi
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Arguments are read from the stack through ebp (offsets defined below).
; esi and edi are saved on entry and restored on exit; eax, ecx, edx and
; mm0-mm7 are overwritten.
;
; The routine works in two phases:
;   1. Edge padding: when 2*output_cols exceeds image_width, the last
;      sample of each of the max_v_samp_factor input rows is replicated
;      to the right until the row is 2*output_cols samples long.
;   2. Downsampling: every 2x2 group of input samples (two adjacent
;      columns taken from two consecutive rows) becomes one output sample
;      (sum + bias) >> 2, where bias alternates 1, 2, 1, 2, ... across
;      the output columns.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v2_downsample_mmx)

EXTN(jsimd_h2v2_downsample_mmx):
        push    ebp
        mov     ebp, esp
        ; ebx is never used; ecx and edx need not be preserved.
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = width_blocks * DCTSIZE
                                        ;     = output_cols
        jz      near .finish            ; nothing to do for zero columns

        mov     edx, JDIMENSION [img_width(ebp)]

        ; ---- phase 1: pad the right edge of every input row ----

        push    ecx                     ; keep output_cols for phase 2
        shl     ecx, 1                  ; ecx = output_cols * 2
        sub     ecx, edx                ; ecx = samples to append per row
        jle     short .pad_done         ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]
        test    eax, eax                ; eax = rows left to pad
        jle     short .pad_done

        cld                             ; stosb must walk upwards
        mov     esi, JSAMPARRAY [input_data(ebp)]
        alignx  16, 7
.pad_row:
        push    eax                     ; al and ecx are consumed below
        push    ecx

        mov     edi, JSAMPROW [esi]     ; edi = start of this row
        add     edi, edx                ; edi = first position past the image
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of the row

        rep stosb                       ; replicate it ecx times

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                     ; ecx = output_cols

        ; ---- phase 2: 2:1 horizontal, 2:1 vertical downsampling ----

        mov     eax, JDIMENSION [v_samp(ebp)]
        test    eax, eax                ; eax = output rows left
        jle     near .finish

        pcmpeqw mm6, mm6
        psrlw   mm6, BYTE_BIT           ; mm6 = {0xFF 0x00 0xFF 0x00 ..}
                                        ; (selects the even-indexed bytes)
        mov     edx, 0x00020001         ; rounding bias, two words: 1 then 2
        movd    mm7, edx
        punpckldq mm7, mm7              ; mm7 = {1, 2, 1, 2}

        mov     esi, JSAMPARRAY [input_data(ebp)]
        mov     edi, JSAMPARRAY [output_data(ebp)]
        alignx  16, 7
.next_row:
        push    ecx                     ; column counter is consumed per row
        push    edi                     ; row-pointer cursors are reused as
        push    esi                     ; sample pointers inside the row

        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; edx = inptr0
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; esi = inptr1
        mov     edi, JSAMPROW [edi]                     ; edi = outptr
        alignx  16, 7
.next_chunk:
        ; Each pass reads 16 samples from each of the two input rows and
        ; writes 8 output samples.

        movq    mm0, MMWORD [edx+0*SIZEOF_MMWORD]
        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm2, MMWORD [edx+1*SIZEOF_MMWORD]
        movq    mm3, MMWORD [esi+1*SIZEOF_MMWORD]

        ; First 8 columns: horizontal pair sums for row 0 (mm0), row 1 (mm1).
        movq    mm4, mm0
        movq    mm5, mm1
        pand    mm0, mm6                ; even samples as words
        psrlw   mm4, BYTE_BIT           ; odd samples as words
        pand    mm1, mm6
        psrlw   mm5, BYTE_BIT
        paddw   mm0, mm4
        paddw   mm1, mm5

        ; Next 8 columns: the same for row 0 (mm2) and row 1 (mm3).
        movq    mm4, mm2
        movq    mm5, mm3
        pand    mm2, mm6
        psrlw   mm4, BYTE_BIT
        pand    mm3, mm6
        psrlw   mm5, BYTE_BIT
        paddw   mm2, mm4
        paddw   mm3, mm5

        paddw   mm0, mm1                ; add the two rows: 2x2 sums
        paddw   mm2, mm3
        paddw   mm0, mm7                ; + alternating bias
        paddw   mm2, mm7
        psrlw   mm0, 2                  ; / 4
        psrlw   mm2, 2

        packuswb mm0, mm2               ; 8 words -> 8 bytes

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     edx, byte 2*SIZEOF_MMWORD       ; inptr0
        add     esi, byte 2*SIZEOF_MMWORD       ; inptr1
        add     edi, byte 1*SIZEOF_MMWORD       ; outptr
        sub     ecx, byte SIZEOF_MMWORD         ; output columns left
        jnz     near .next_chunk

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte 2*SIZEOF_JSAMPROW     ; skip two input row pointers
        add     edi, byte 1*SIZEOF_JSAMPROW     ; next output row pointer
        dec     eax                             ; output rows left
        jg      near .next_row

        emms                            ; empty MMX state

.finish:
        pop     edi
        pop     esi
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16