;
; jcsamss2-64.asm - downsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman (a] cendio.se> for Cendio AB
; Copyright 2009 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Two phases:
;   1. expand_right_edge: for each of max_v_samp_factor input rows, the last
;      real sample (column image_width-1) is replicated into columns
;      image_width .. 2*output_cols-1, so phase 2 reads defined data.
;   2. h2v1_downsample: for each of v_samp_factor rows, every horizontal pair
;      of samples is averaged with an alternating 0/1 rounding bias.
;
; The four scalar arguments are 32-bit C types.  Only the low 32 bits of
; their registers are meaningful, so each one is loaded through its 32-bit
; sub-register (which zero-extends) before being used in 64-bit arithmetic.
;
; Row data is accessed with movdqa, which faults on addresses that are not
; 16-byte aligned; NOTE(review): the allocator guaranteeing that alignment
; (and enough padding for the right-edge expansion) is not visible here.
;

; r10 = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v1_downsample_sse2) PRIVATE

EXTN(jsimd_h2v1_downsample_sse2):
        push    rbp
        mov     rax, rsp                ; kept from the original prologue;
                                        ; presumably read by collect_args
                                        ; (defined in jsimdext.inc) - confirm
        mov     rbp, rsp
        collect_args

        mov     ecx, r13d               ; width_blocks (zero-extended)
        shl     rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return

        mov     edx, r10d               ; image_width (zero-extended)

        ; -- expand_right_edge

        push    rcx                     ; save output_cols
        shl     rcx, 1                  ; output_cols * 2
        sub     rcx, rdx                ; rcx = number of padding columns
        jle     short .expand_end

        mov     eax, r11d               ; max_v_samp_factor (row count)
        test    eax, eax
        jle     short .expand_end

        cld                             ; rep stosb must advance upward
        mov     rsi, r14                ; input_data
.expandloop:
        push    rax                     ; al is reused as the fill value
        push    rcx                     ; rep stosb consumes rcx

        mov     rdi, JSAMPROW [rsi]
        add     rdi, rdx                ; rdi -> first column past the image
        mov     al, JSAMPLE [rdi-1]     ; last real sample of this row

        rep stosb

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                     ; output_cols

        ; -- h2v1_downsample

        mov     eax, r12d               ; rowctr (zero-extended)
        test    eax, eax
        jle     near .return

        mov     edx, 0x00010000         ; bias pattern
        movd    xmm7, edx
        pcmpeqw xmm6, xmm6
        pshufd  xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
        psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14                ; input_data
        mov     rdi, r15                ; output_data
.rowloop:
        push    rcx
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

.columnloop_r8:
        ; Fewer than SIZEOF_XMMWORD output columns remain: load one input
        ; vector, zero the second, and force the counter so the loop exits
        ; after this pass.
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm1, xmm1
        mov     rcx, SIZEOF_XMMWORD
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm1

        pand    xmm0, xmm6              ; even-indexed samples as words
        psrlw   xmm2, BYTE_BIT          ; odd-indexed samples as words
        pand    xmm1, xmm6
        psrlw   xmm3, BYTE_BIT

        paddw   xmm0, xmm2              ; pair sums
        paddw   xmm1, xmm3
        paddw   xmm0, xmm7              ; + alternating bias
        paddw   xmm1, xmm7
        psrlw   xmm0, 1                 ; / 2
        psrlw   xmm1, 1

        packuswb xmm0, xmm1             ; back to bytes

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop
        test    rcx, rcx
        jnz     short .columnloop_r8

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Two phases:
;   1. expand_right_edge: for each of max_v_samp_factor input rows, the last
;      real sample (column image_width-1) is replicated into columns
;      image_width .. 2*output_cols-1, so phase 2 reads defined data.
;   2. h2v2_downsample: for each of v_samp_factor output rows, two input rows
;      are consumed and every 2x2 group of samples is averaged with an
;      alternating 1/2 rounding bias.
;
; The four scalar arguments are 32-bit C types.  Only the low 32 bits of
; their registers are meaningful, so each one is loaded through its 32-bit
; sub-register (which zero-extends) before being used in 64-bit arithmetic.
;
; Row data is accessed with movdqa, which faults on addresses that are not
; 16-byte aligned; NOTE(review): the allocator guaranteeing that alignment
; (and enough padding for the right-edge expansion) is not visible here.
;

; r10 = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v2_downsample_sse2) PRIVATE

EXTN(jsimd_h2v2_downsample_sse2):
        push    rbp
        mov     rax, rsp                ; kept from the original prologue;
                                        ; presumably read by collect_args
                                        ; (defined in jsimdext.inc) - confirm
        mov     rbp, rsp
        collect_args

        mov     ecx, r13d               ; width_blocks (zero-extended)
        shl     rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return

        mov     edx, r10d               ; image_width (zero-extended)

        ; -- expand_right_edge

        push    rcx                     ; save output_cols
        shl     rcx, 1                  ; output_cols * 2
        sub     rcx, rdx                ; rcx = number of padding columns
        jle     short .expand_end

        mov     eax, r11d               ; max_v_samp_factor (row count)
        test    eax, eax
        jle     short .expand_end

        cld                             ; rep stosb must advance upward
        mov     rsi, r14                ; input_data
.expandloop:
        push    rax                     ; al is reused as the fill value
        push    rcx                     ; rep stosb consumes rcx

        mov     rdi, JSAMPROW [rsi]
        add     rdi, rdx                ; rdi -> first column past the image
        mov     al, JSAMPLE [rdi-1]     ; last real sample of this row

        rep stosb

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                     ; output_cols

        ; -- h2v2_downsample

        mov     eax, r12d               ; rowctr (zero-extended)
        test    eax, eax
        jle     near .return

        mov     edx, 0x00020001         ; bias pattern
        movd    xmm7, edx
        pcmpeqw xmm6, xmm6
        pshufd  xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
        psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14                ; input_data
        mov     rdi, r15                ; output_data
.rowloop:
        push    rcx
        push    rdi
        push    rsi

        mov     rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
        mov     rdi, JSAMPROW [rdi]                     ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

.columnloop_r8:
        ; Fewer than SIZEOF_XMMWORD output columns remain: load one vector
        ; from each input row, zero the second pair, and force the counter
        ; so the loop exits after this pass.
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        mov     rcx, SIZEOF_XMMWORD
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        movdqa  xmm4, xmm0
        movdqa  xmm5, xmm1
        pand    xmm0, xmm6              ; even-indexed samples as words
        psrlw   xmm4, BYTE_BIT          ; odd-indexed samples as words
        pand    xmm1, xmm6
        psrlw   xmm5, BYTE_BIT
        paddw   xmm0, xmm4              ; horizontal pair sums, row 0
        paddw   xmm1, xmm5              ; horizontal pair sums, row 1

        movdqa  xmm4, xmm2
        movdqa  xmm5, xmm3
        pand    xmm2, xmm6
        psrlw   xmm4, BYTE_BIT
        pand    xmm3, xmm6
        psrlw   xmm5, BYTE_BIT
        paddw   xmm2, xmm4
        paddw   xmm3, xmm5

        paddw   xmm0, xmm1              ; 2x2 sums
        paddw   xmm2, xmm3
        paddw   xmm0, xmm7              ; + alternating bias
        paddw   xmm2, xmm7
        psrlw   xmm0, 2                 ; / 4
        psrlw   xmm2, 2

        packuswb xmm0, xmm2             ; back to bytes

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop
        test    rcx, rcx
        jnz     near .columnloop_r8

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte 2*SIZEOF_JSAMPROW     ; input_data
        add     rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16