;
; jcsample.asm - downsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman (a] cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
;                           JDIMENSION v_samp_factor,
;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                           JSAMPARRAY output_data);
;
; Arguments are read from the stack relative to ebp (offsets below).
; Register use inside the routine:
;   ecx = output_cols (width_in_blocks * 8); temporarily the pad byte count
;   edx = image_width during edge expansion, then scratch for the bias
;   eax = row counter (max_v_samp_factor rows, then v_samp_factor rows)
;   esi = input row-pointer cursor / input sample pointer
;   edi = output row-pointer cursor / output sample pointer
;   mm6 = per-word low-byte mask, mm7 = per-word bias
; esi and edi are saved and restored; ebx is never touched; ecx and edx
; are modified and not restored.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_mmx)

EXTN(jsimd_h2v1_downsample_mmx):
    push        ebp
    mov         ebp, esp
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (blocks * 8)
    jz          near .done              ; zero-width request: nothing to do

    mov         edx, JDIMENSION [img_width(ebp)]

    ; ---- Step 1: replicate the last real sample of every input row so
    ;      that each row holds output_cols * 2 samples.

    push        ecx                     ; keep output_cols for step 2
    shl         ecx, 1                  ; output_cols * 2
    sub         ecx, edx                ; ecx = number of samples to append
    jle         short .pad_done         ; rows are already wide enough

    mov         eax, INT [max_v_samp(ebp)]
    test        eax, eax
    jle         short .pad_done         ; no rows to pad

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; row-pointer cursor
    cld                                 ; stosb must walk forward
    alignx      16, 7
.pad_row:
    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi -> first sample past the image

    push        eax                     ; al is about to hold the fill value
    push        ecx                     ; rep consumes ecx

    mov         al, JSAMPLE [edi-1]     ; last real sample of this row
    rep stosb                           ; append ecx copies of it

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW  ; next row pointer
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols again

    ; ---- Step 2: average each horizontal pair of samples.

    mov         eax, JDIMENSION [v_samp(ebp)]  ; eax = rows to produce
    test        eax, eax
    jle         near .done

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input row-pointer cursor
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output row-pointer cursor

    pcmpeqw     mm6, mm6                ; all ones
    psrlw       mm6, BYTE_BIT           ; mm6 = low-byte mask in every word

    mov         edx, 0x00010000         ; bias words, low to high: 0, 1
    movd        mm7, edx
    punpckldq   mm7, mm7                ; mm7 = {0, 1, 0, 1}
    alignx      16, 7
.next_row:
    push        esi                     ; row-pointer cursors and the column
    push        edi                     ; count are reused by the inner loop
    push        ecx

    mov         esi, JSAMPROW [esi]     ; esi -> input samples of this row
    mov         edi, JSAMPROW [edi]     ; edi -> output samples of this row
    alignx      16, 7
.next_cols:
    ; Two MMWORDs of input produce one MMWORD of output per pass.

    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm1, MMWORD [esi+1*SIZEOF_MMWORD]
    movq        mm2, mm0
    movq        mm3, mm1

    pand        mm0, mm6                ; even-indexed samples as words
    psrlw       mm2, BYTE_BIT           ; odd-indexed samples as words
    pand        mm1, mm6
    psrlw       mm3, BYTE_BIT

    paddw       mm0, mm2                ; sum of each horizontal pair
    paddw       mm1, mm3
    paddw       mm0, mm7                ; add the bias pattern
    paddw       mm1, mm7
    psrlw       mm0, 1                  ; divide by 2
    psrlw       mm1, 1

    packuswb    mm0, mm1                ; word results back to bytes

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0

    add         edi, byte 1*SIZEOF_MMWORD  ; advance output pointer
    add         esi, byte 2*SIZEOF_MMWORD  ; advance input pointer
    sub         ecx, byte SIZEOF_MMWORD    ; output columns remaining
    jnz         short .next_cols

    pop         ecx
    pop         edi
    pop         esi

    add         esi, byte SIZEOF_JSAMPROW  ; next input row pointer
    add         edi, byte SIZEOF_JSAMPROW  ; next output row pointer
    dec         eax                        ; rows remaining
    jg          short .next_row

    emms                                ; leave MMX state empty

.done:
    pop         edi
    pop         esi
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
;                           JDIMENSION v_samp_factor,
;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                           JSAMPARRAY output_data);
;
; Arguments are read from the stack relative to ebp (offsets below).
; Register use inside the routine:
;   ecx = output_cols (width_in_blocks * 8); temporarily the pad byte count
;   edx = image_width during edge expansion, then bias scratch, then the
;         upper input row pointer inside the row loop
;   eax = row counter (max_v_samp_factor rows, then v_samp_factor rows)
;   esi = input row-pointer cursor / lower input row pointer
;   edi = output row-pointer cursor / output sample pointer
;   mm6 = per-word low-byte mask, mm7 = per-word bias
; esi and edi are saved and restored; ebx is never touched; ecx and edx
; are modified and not restored.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_mmx)

EXTN(jsimd_h2v2_downsample_mmx):
    push        ebp
    mov         ebp, esp
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (blocks * 8)
    jz          near .done              ; zero-width request: nothing to do

    mov         edx, JDIMENSION [img_width(ebp)]

    ; ---- Step 1: replicate the last real sample of every input row so
    ;      that each row holds output_cols * 2 samples.

    push        ecx                     ; keep output_cols for step 2
    shl         ecx, 1                  ; output_cols * 2
    sub         ecx, edx                ; ecx = number of samples to append
    jle         short .pad_done         ; rows are already wide enough

    mov         eax, INT [max_v_samp(ebp)]
    test        eax, eax
    jle         short .pad_done         ; no rows to pad

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; row-pointer cursor
    cld                                 ; stosb must walk forward
    alignx      16, 7
.pad_row:
    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi -> first sample past the image

    push        eax                     ; al is about to hold the fill value
    push        ecx                     ; rep consumes ecx

    mov         al, JSAMPLE [edi-1]     ; last real sample of this row
    rep stosb                           ; append ecx copies of it

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW  ; next row pointer
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols again

    ; ---- Step 2: average each 2x2 group of samples.

    mov         eax, JDIMENSION [v_samp(ebp)]  ; eax = rows to produce
    test        eax, eax
    jle         near .done

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input row-pointer cursor
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output row-pointer cursor

    pcmpeqw     mm6, mm6                ; all ones
    psrlw       mm6, BYTE_BIT           ; mm6 = low-byte mask in every word

    mov         edx, 0x00020001         ; bias words, low to high: 1, 2
    movd        mm7, edx
    punpckldq   mm7, mm7                ; mm7 = {1, 2, 1, 2}
    alignx      16, 7
.next_row:
    push        esi                     ; row-pointer cursors and the column
    push        edi                     ; count are reused by the inner loop
    push        ecx

    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; edx -> upper input row
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; esi -> lower input row
    mov         edi, JSAMPROW [edi]                    ; edi -> output row
    alignx      16, 7
.next_cols:
    ; Two MMWORDs from each of the two input rows produce one MMWORD of
    ; output per pass.

    movq        mm0, MMWORD [edx+0*SIZEOF_MMWORD]
    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm2, MMWORD [edx+1*SIZEOF_MMWORD]
    movq        mm3, MMWORD [esi+1*SIZEOF_MMWORD]

    ; First MMWORD of each row: horizontal pair sums.
    movq        mm4, mm0
    movq        mm5, mm1
    pand        mm0, mm6                ; even-indexed samples as words
    psrlw       mm4, BYTE_BIT           ; odd-indexed samples as words
    pand        mm1, mm6
    psrlw       mm5, BYTE_BIT
    paddw       mm0, mm4                ; upper-row pair sums
    paddw       mm1, mm5                ; lower-row pair sums

    ; Second MMWORD of each row: same treatment.
    movq        mm4, mm2
    movq        mm5, mm3
    pand        mm2, mm6
    psrlw       mm4, BYTE_BIT
    pand        mm3, mm6
    psrlw       mm5, BYTE_BIT
    paddw       mm2, mm4
    paddw       mm3, mm5

    paddw       mm0, mm1                ; sum of each 2x2 group
    paddw       mm2, mm3
    paddw       mm0, mm7                ; add the bias pattern
    paddw       mm2, mm7
    psrlw       mm0, 2                  ; divide by 4
    psrlw       mm2, 2

    packuswb    mm0, mm2                ; word results back to bytes

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0

    add         edi, byte 1*SIZEOF_MMWORD  ; advance output pointer
    add         edx, byte 2*SIZEOF_MMWORD  ; advance upper-row pointer
    add         esi, byte 2*SIZEOF_MMWORD  ; advance lower-row pointer
    sub         ecx, byte SIZEOF_MMWORD    ; output columns remaining
    jnz         near .next_cols

    pop         ecx
    pop         edi
    pop         esi

    add         esi, byte 2*SIZEOF_JSAMPROW  ; two input rows per output row
    add         edi, byte 1*SIZEOF_JSAMPROW  ; next output row pointer
    dec         eax                          ; rows remaining
    jg          near .next_row

    emms                                ; leave MMX state empty

.done:
    pop         edi
    pop         esi
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32