;
; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman (a] cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
;                            FAST_FLOAT * workspace);
;
; Args:   read from the stack through ebp (see the %defines below).
; Saved:  ebp, ebx, esi, edi (pushed on entry, popped on exit).
; Clobb:  eax, ecx, edx, xmm0-xmm3, xmm7, flags.
; Notes:  workspace is written with movaps, so it must be 16-byte aligned.
;
; Each pass of the loop handles two sample rows.  From each row it loads
; 8 bytes starting at start_col, subtracts 0x80 from every byte, widens
; the bytes to sign-extended dwords, converts them to single-precision
; floats and writes them to the workspace.
;
; Loop register roles:
;   esi = current position in the sample_data row-pointer array
;   eax = start_col
;   edi = current workspace write position
;   ecx = row pairs still to process (starts at DCTSIZE/2)
;   xmm7 = 0x80 in every byte
;

%define sample_data     ebp+8           ; JSAMPARRAY sample_data
%define start_col       ebp+12          ; JDIMENSION start_col
%define workspace       ebp+16          ; FAST_FLOAT * workspace

        align   16
        global  EXTN(jsimd_convsamp_float_sse2)

EXTN(jsimd_convsamp_float_sse2):
        push    ebp
        mov     ebp, esp
        push    ebx
        push    esi
        push    edi

        ; Build the bias without a memory constant:
        ; 0xFFFF per word -> 0xFF80 per word -> signed-saturating pack -> 0x80 per byte.
        pcmpeqw  xmm7, xmm7
        psllw    xmm7, 7
        packsswb xmm7, xmm7             ; xmm7 = PB_CENTERJSAMPLE (0x808080..)

        mov     edi, POINTER [workspace]                ; (FAST_FLOAT *)
        mov     eax, JDIMENSION [start_col]
        mov     esi, JSAMPARRAY [sample_data]           ; (JSAMPROW *)
        mov     ecx, DCTSIZE/2
        alignx  16,7
.next_row_pair:
        ; Fetch both row pointers and both 8-byte sample groups before
        ; anything is written to the workspace.
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        movq    xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
        movq    xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

        ; ---- first row: samples 0..7 -> xmm2 (0123), xmm0 (4567) ----
        ; '*' marks don't-care bytes; the arithmetic right shift discards
        ; them and sign-extends the sample byte sitting in the top byte of
        ; each dword.
        psubb     xmm0, xmm7                    ; xmm0=(01234567)
        punpcklbw xmm0, xmm0                    ; xmm0=(*0*1*2*3*4*5*6*7)
        punpcklwd xmm2, xmm0                    ; xmm2=(***0***1***2***3)
        punpckhwd xmm0, xmm0                    ; xmm0=(***4***5***6***7)
        psrad     xmm2, (DWORD_BIT-BYTE_BIT)    ; xmm2=(0123)
        psrad     xmm0, (DWORD_BIT-BYTE_BIT)    ; xmm0=(4567)
        cvtdq2ps  xmm2, xmm2                    ; xmm2=(0123)
        cvtdq2ps  xmm0, xmm0                    ; xmm0=(4567)

        ; ---- second row: samples 8..F -> xmm3 (89AB), xmm1 (CDEF) ----
        psubb     xmm1, xmm7                    ; xmm1=(89ABCDEF)
        punpcklbw xmm1, xmm1                    ; xmm1=(*8*9*A*B*C*D*E*F)
        punpcklwd xmm3, xmm1                    ; xmm3=(***8***9***A***B)
        punpckhwd xmm1, xmm1                    ; xmm1=(***C***D***E***F)
        psrad     xmm3, (DWORD_BIT-BYTE_BIT)    ; xmm3=(89AB)
        psrad     xmm1, (DWORD_BIT-BYTE_BIT)    ; xmm1=(CDEF)
        cvtdq2ps  xmm3, xmm3                    ; xmm3=(89AB)
        cvtdq2ps  xmm1, xmm1                    ; xmm1=(CDEF)

        ; Write the 16 converted values (4 floats per store).
        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1

        ; Advance by two row pointers / two workspace rows.
        ; dec must stay last so that jnz tests the row-pair counter.
        add     esi, byte 2*SIZEOF_JSAMPROW
        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
        dec     ecx
        jnz     short .next_row_pair

        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret


; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; GLOBAL(void)
; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
;                            FAST_FLOAT * workspace);
;
; Args:   read from the stack through ebp (see the %defines below).
; Saved:  ebp, esi, edi (pushed on entry, popped on exit).
; Clobb:  eax, edx, xmm0-xmm3, flags.  ebx and ecx are not used.
; Notes:  workspace and divisors are accessed with movaps/mulps memory
;         operands and coef_block with movdqa, so all three must be
;         16-byte aligned.
;
; Each pass of the loop takes 16 floats from workspace, multiplies them
; element-wise by the matching 16 floats from divisors, converts the
; products to 32-bit integers (cvtps2dq, i.e. using the current MXCSR
; rounding mode), narrows them to 16 bits with signed saturation and
; stores them to coef_block.
;
; NOTE(review): the values are multiplied, not divided, so the "divisors"
; table presumably holds reciprocals -- confirm against the caller.
;
; Loop register roles:
;   esi = current workspace read position
;   edx = current divisors read position
;   edi = current coef_block write position
;   eax = groups of 16 coefficients still to process (starts at DCTSIZE2/16)
;

%define coef_block      ebp+8           ; JCOEFPTR coef_block
%define divisors        ebp+12          ; FAST_FLOAT * divisors
%define workspace       ebp+16          ; FAST_FLOAT * workspace

        align   16
        global  EXTN(jsimd_quantize_float_sse2)

EXTN(jsimd_quantize_float_sse2):
        push    ebp
        mov     ebp, esp
        push    esi
        push    edi

        mov     edi, JCOEFPTR [coef_block]
        mov     edx, POINTER [divisors]
        mov     esi, POINTER [workspace]
        mov     eax, DCTSIZE2/16
        alignx  16,7
.next_16_coefs:
        ; Scale: xmmN = workspace[...] * divisors[...], four floats at a time.
        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

        ; Float -> int32, then pack each pair of registers into 8 int16
        ; values with signed saturation.
        cvtps2dq xmm0, xmm0
        cvtps2dq xmm1, xmm1
        packssdw xmm0, xmm1
        cvtps2dq xmm2, xmm2
        cvtps2dq xmm3, xmm3
        packssdw xmm2, xmm3

        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2

        ; Advance all three pointers by 16 elements.
        ; dec must stay last so that jnz tests the group counter.
        add     esi, byte 16*SIZEOF_FAST_FLOAT
        add     edx, byte 16*SIZEOF_FAST_FLOAT
        add     edi, byte 16*SIZEOF_JCOEF
        dec     eax
        jnz     short .next_16_coefs

        pop     edi
        pop     esi
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16