;
; jcsample.asm - downsampling (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman (a] cendio.se> for Cendio AB
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32

; --------------------------------------------------------------------------
; Stack-argument accessors shared by both routines in this file.  Both
; routines take the same six arguments and set up an ebp frame, so each
; argument lives at a fixed displacement from the frame pointer `b`.

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Outline:
;   1. output_cols = width_in_blocks * 8; nothing to do if that is zero.
;   2. If output_cols * 2 exceeds image_width, each of the first
;      max_v_samp_factor input rows is padded on the right by repeating its
;      last valid sample (row[image_width - 1]) out to output_cols * 2.
;   3. For each of v_samp_factor rows, every output sample is
;      (in[2i] + in[2i+1] + bias) >> 1, where bias alternates 0, 1, 0, 1, ...
;      across successive output columns.
;
; Registers: ebp, esi and edi are saved and restored; ebx is never touched;
; eax, ecx, edx, ymm0-ymm3, ymm6, ymm7 and the flags are overwritten.  DF is
; cleared (cld) when step 2 runs.  vzeroupper is executed before returning.
;
; NOTE(review): every store writes a full YMMWORD (32 bytes), including on
; the final partial chunk of a row, so each output row presumably has to be
; allocated with room for output_cols rounded up to a multiple of 32 --
; confirm against the caller's row allocator.
;

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)

EXTN(jsimd_h2v1_downsample_avx2):
    push        ebp
    mov         ebp, esp
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (width_in_blocks * 8)
    jz          near .done              ; zero columns: nothing to do

    mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width

    ; -- Step 2: replicate the right edge of every input row

    push        ecx                     ; keep output_cols for step 3
    shl         ecx, 1                  ; input columns required
    sub         ecx, edx                ; ecx = number of padding bytes per row
    jle         short .pad_done         ; rows are already wide enough

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows left to pad
    test        eax, eax
    jle         short .pad_done

    cld                                 ; stosb must advance edi upwards
    mov         esi, JSAMPARRAY [input_data(ebp)]
    alignx      16, 7
.pad_row:
    push        eax                     ; al is about to hold the fill sample
    push        ecx                     ; rep stosb counts ecx down to zero

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi = first byte past the valid samples
    mov         al, JSAMPLE [edi-1]     ; al = last valid sample of this row

    rep stosb                           ; write ecx copies of al at [edi]

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW   ; next row pointer
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols again

    ; -- Step 3: horizontal 2:1 averaging

    mov         eax, JDIMENSION [v_samp(ebp)]   ; eax = rows left to process
    test        eax, eax
    jle         near .done

    mov         edx, 0x00010000         ; one dword = word pair {0, 1}
    vmovd       xmm7, edx
    vpshufd     xmm7, xmm7, 0x00        ; xmm7 = {0, 1, 0, 1, 0, 1, 0, 1} (words)
    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7 = {xmm7, xmm7}: rounding bias
    vpcmpeqw    ymm6, ymm6, ymm6        ; all bits set
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6 = {0xFF 0x00 0xFF 0x00 ..}: even-byte mask

    mov         esi, JSAMPARRAY [input_data(ebp)]
    mov         edi, JSAMPARRAY [output_data(ebp)]
    alignx      16, 7
.next_row:
    push        ecx                     ; output_cols, reused for every row
    push        edi                     ; output row-pointer cursor
    push        esi                     ; input row-pointer cursor

    mov         esi, JSAMPROW [esi]     ; esi = input sample pointer
    mov         edi, JSAMPROW [edi]     ; edi = output sample pointer

    cmp         ecx, byte SIZEOF_YMMWORD
    jae         short .full_chunk
    alignx      16, 7

    ; Fewer than SIZEOF_YMMWORD output columns remain.  Per the original
    ; note, ecx can only be 8, 16 or 24 here.  Load just the input bytes
    ; that are needed, leave the rest of ymm0/ymm1 zero, and set ecx to one
    ; full chunk so that the loop below terminates after this pass.

.tail_24:
    cmp         ecx, 24
    jne         .tail_16
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD]  ; upper half of ymm1 = 0
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .average

.tail_16:
    cmp         ecx, 16
    jne         .tail_8
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vpxor       ymm1, ymm1, ymm1
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .average

.tail_8:
    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; upper half of ymm0 = 0
    vpxor       ymm1, ymm1, ymm1
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .average
    alignx      16, 7

.full_chunk:
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD]

.average:
    vpsrlw      ymm2, ymm0, BYTE_BIT    ; ymm2 = odd-indexed samples, as words
    vpand       ymm0, ymm0, ymm6        ; ymm0 = even-indexed samples, as words
    vpsrlw      ymm3, ymm1, BYTE_BIT
    vpand       ymm1, ymm1, ymm6

    vpaddw      ymm0, ymm0, ymm2        ; even + odd
    vpaddw      ymm1, ymm1, ymm3
    vpaddw      ymm0, ymm0, ymm7        ; + alternating bias
    vpaddw      ymm1, ymm1, ymm7
    vpsrlw      ymm0, ymm0, 1           ; / 2
    vpsrlw      ymm1, ymm1, 1

    vpackuswb   ymm0, ymm0, ymm1        ; words -> bytes, packed per 128-bit lane
    vpermq      ymm0, ymm0, 0xd8        ; qwords {0,2,1,3}: undo lane interleave

    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0

    sub         ecx, byte SIZEOF_YMMWORD    ; output columns still to do
    add         esi, byte 2*SIZEOF_YMMWORD  ; consumed two input vectors
    add         edi, byte 1*SIZEOF_YMMWORD  ; produced one output vector
    cmp         ecx, byte SIZEOF_YMMWORD
    jae         short .full_chunk
    test        ecx, ecx
    jnz         near .tail_24           ; partial chunk left over

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte SIZEOF_JSAMPROW   ; next input row pointer
    add         edi, byte SIZEOF_JSAMPROW   ; next output row pointer
    dec         eax
    jg          near .next_row

.done:
    vzeroupper
    pop         edi
    pop         esi
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Outline:
;   1. output_cols = width_in_blocks * 8; nothing to do if that is zero.
;   2. If output_cols * 2 exceeds image_width, each of the first
;      max_v_samp_factor input rows is padded on the right by repeating its
;      last valid sample out to output_cols * 2.
;   3. For each of v_samp_factor output rows, two consecutive input rows are
;      consumed and every output sample is
;      (r0[2i] + r0[2i+1] + r1[2i] + r1[2i+1] + bias) >> 2, where bias
;      alternates 1, 2, 1, 2, ... across successive output columns.
;
; Registers: ebp, esi and edi are saved and restored; ebx is never touched;
; eax, ecx, edx, ymm0-ymm7 and the flags are overwritten.  DF is cleared
; (cld) when step 2 runs.  vzeroupper is executed before returning.
;
; NOTE(review): every store writes a full YMMWORD (32 bytes), including on
; the final partial chunk of a row, so each output row presumably has to be
; allocated with room for output_cols rounded up to a multiple of 32 --
; confirm against the caller's row allocator.
;

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)

EXTN(jsimd_h2v2_downsample_avx2):
    push        ebp
    mov         ebp, esp
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (width_in_blocks * 8)
    jz          near .done              ; zero columns: nothing to do

    mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width

    ; -- Step 2: replicate the right edge of every input row

    push        ecx                     ; keep output_cols for step 3
    shl         ecx, 1                  ; input columns required
    sub         ecx, edx                ; ecx = number of padding bytes per row
    jle         short .pad_done         ; rows are already wide enough

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows left to pad
    test        eax, eax
    jle         short .pad_done

    cld                                 ; stosb must advance edi upwards
    mov         esi, JSAMPARRAY [input_data(ebp)]
    alignx      16, 7
.pad_row:
    push        eax                     ; al is about to hold the fill sample
    push        ecx                     ; rep stosb counts ecx down to zero

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi = first byte past the valid samples
    mov         al, JSAMPLE [edi-1]     ; al = last valid sample of this row

    rep stosb                           ; write ecx copies of al at [edi]

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW   ; next row pointer
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols again

    ; -- Step 3: 2x2 box averaging

    mov         eax, JDIMENSION [v_samp(ebp)]   ; eax = output rows left
    test        eax, eax
    jle         near .done

    mov         edx, 0x00020001         ; one dword = word pair {1, 2}
    vmovd       xmm7, edx
    vpcmpeqw    ymm6, ymm6, ymm6        ; all bits set
    vpshufd     xmm7, xmm7, 0x00        ; xmm7 = {1, 2, 1, 2, 1, 2, 1, 2} (words)
    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7 = {xmm7, xmm7}: rounding bias
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6 = {0xFF 0x00 0xFF 0x00 ..}: even-byte mask

    mov         esi, JSAMPARRAY [input_data(ebp)]
    mov         edi, JSAMPARRAY [output_data(ebp)]
    alignx      16, 7
.next_row:
    push        ecx                     ; output_cols, reused for every row
    push        edi                     ; output row-pointer cursor
    push        esi                     ; input row-pointer cursor

    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; edx = upper input row
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; esi = lower input row
    mov         edi, JSAMPROW [edi]                    ; edi = output sample pointer

    cmp         ecx, byte SIZEOF_YMMWORD
    jae         short .full_chunk
    alignx      16, 7

    ; Fewer than SIZEOF_YMMWORD output columns remain.  The branches below
    ; distinguish 24, 16 and "anything else" (handled as 8).  Load just the
    ; input bytes that are needed from both rows, leave the rest zero, and
    ; set ecx to one full chunk so that the loop terminates after this pass.

.tail_24:
    cmp         ecx, 24
    jne         .tail_16
    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD]  ; upper half of ymm2 = 0
    vmovdqu     xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD]  ; upper half of ymm3 = 0
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .average

.tail_16:
    cmp         ecx, 16
    jne         .tail_8
    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .average

.tail_8:
    vmovdqu     xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]  ; upper half of ymm0 = 0
    vmovdqu     xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; upper half of ymm1 = 0
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .average
    alignx      16, 7

.full_chunk:
    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD]
    vmovdqu     ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD]

.average:
    ; First input vector of each row: horizontal pair sums, as words
    vpand       ymm4, ymm0, ymm6        ; even-indexed samples, upper row
    vpsrlw      ymm0, ymm0, BYTE_BIT    ; odd-indexed samples, upper row
    vpand       ymm5, ymm1, ymm6        ; even-indexed samples, lower row
    vpsrlw      ymm1, ymm1, BYTE_BIT    ; odd-indexed samples, lower row
    vpaddw      ymm0, ymm0, ymm4
    vpaddw      ymm1, ymm1, ymm5

    ; Second input vector of each row: same treatment
    vpand       ymm4, ymm2, ymm6
    vpsrlw      ymm2, ymm2, BYTE_BIT
    vpand       ymm5, ymm3, ymm6
    vpsrlw      ymm3, ymm3, BYTE_BIT
    vpaddw      ymm2, ymm2, ymm4
    vpaddw      ymm3, ymm3, ymm5

    vpaddw      ymm0, ymm0, ymm1        ; upper row + lower row
    vpaddw      ymm2, ymm2, ymm3
    vpaddw      ymm0, ymm0, ymm7        ; + alternating bias
    vpaddw      ymm2, ymm2, ymm7
    vpsrlw      ymm0, ymm0, 2           ; / 4
    vpsrlw      ymm2, ymm2, 2

    vpackuswb   ymm0, ymm0, ymm2        ; words -> bytes, packed per 128-bit lane
    vpermq      ymm0, ymm0, 0xd8        ; qwords {0,2,1,3}: undo lane interleave

    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0

    sub         ecx, byte SIZEOF_YMMWORD    ; output columns still to do
    add         edx, byte 2*SIZEOF_YMMWORD  ; upper row: two vectors consumed
    add         esi, byte 2*SIZEOF_YMMWORD  ; lower row: two vectors consumed
    add         edi, byte 1*SIZEOF_YMMWORD  ; produced one output vector
    cmp         ecx, byte SIZEOF_YMMWORD
    jae         near .full_chunk
    test        ecx, ecx
    jnz         near .tail_24           ; partial chunk left over

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte 2*SIZEOF_JSAMPROW ; skip the two input rows just used
    add         edi, byte 1*SIZEOF_JSAMPROW ; next output row pointer
    dec         eax
    jg          near .next_row

.done:
    vzeroupper
    pop         edi
    pop         esi
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32