Home | History | Annotate | Download | only in ia32
      1 ;  vim:filetype=nasm ts=8
      2 
      3 ;  libFLAC - Free Lossless Audio Codec library
      4 ;  Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
      5 ;
      6 ;  Redistribution and use in source and binary forms, with or without
      7 ;  modification, are permitted provided that the following conditions
      8 ;  are met:
      9 ;
     10 ;  - Redistributions of source code must retain the above copyright
     11 ;  notice, this list of conditions and the following disclaimer.
     12 ;
     13 ;  - Redistributions in binary form must reproduce the above copyright
     14 ;  notice, this list of conditions and the following disclaimer in the
     15 ;  documentation and/or other materials provided with the distribution.
     16 ;
     17 ;  - Neither the name of the Xiph.org Foundation nor the names of its
     18 ;  contributors may be used to endorse or promote products derived from
     19 ;  this software without specific prior written permission.
     20 ;
     21 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     22 ;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     23 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     24 ;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
     25 ;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     26 ;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     27 ;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     28 ;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     29 ;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     30 ;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     31 ;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     32 
     33 %include "nasm.h"
     34 
     35 	data_section
        	; NOTE(review): data_section, code_section, cglobal and cident are macros
        	; from nasm.h, which is not visible here.  Nothing is emitted between
        	; data_section and code_section in this file.
     36 
        ; Entry points announced by this file.  Presumably cglobal exports each
        ; symbol with the platform's name decoration -- confirm in nasm.h.
     37 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
     38 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
     39 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
     40 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
     41 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
     42 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
     43 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
     44 cglobal FLAC__lpc_restore_signal_asm_ia32
     45 cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
     46 
     47 	code_section
     48 
     49 ; **********************************************************************
     50 ;
     51 ; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
     52 ; {
     53 ;	FLAC__real d;
     54 ;	unsigned sample, coeff;
     55 ;	const unsigned limit = data_len - lag;
     56 ;
     57 ;	FLAC__ASSERT(lag > 0);
     58 ;	FLAC__ASSERT(lag <= data_len);
     59 ;
     60 ;	for(coeff = 0; coeff < lag; coeff++)
     61 ;		autoc[coeff] = 0.0;
     62 ;	for(sample = 0; sample <= limit; sample++) {
     63 ;		d = data[sample];
     64 ;		for(coeff = 0; coeff < lag; coeff++)
     65 ;			autoc[coeff] += d * data[sample+coeff];
     66 ;	}
     67 ;	for(; sample < data_len; sample++) {
     68 ;		d = data[sample];
     69 ;		for(coeff = 0; coeff < data_len - sample; coeff++)
     70 ;			autoc[coeff] += d * data[sample+coeff];
     71 ;	}
     72 ; }
     73 ;
     74 	ALIGN 16
     75 cident FLAC__lpc_compute_autocorrelation_asm_ia32
        	; x87 implementation of the C reference loop quoted in the comment block
        	; above: autoc[c] accumulates data[sample] * data[sample + c] for every
        	; sample and every c in [0, lag).
        	;
        	; The inner loop over c is fully unrolled.  Rather than counting, the code
        	; computes an entry address inside the unrolled block (edx) and jumps to it,
        	; so that exactly the wanted number of multiply-accumulate groups execute
        	; before falling out at .jumper1_0 / .jumper2_0.
        	;
        	; Register roles: esi = &data[sample], edi = autoc, ecx = outer loop
        	; counter, edx = entry address into the unrolled block, ST0 = d.
        	; esi, edi and ebx are saved and restored; eax, ecx, edx and EFLAGS are
        	; scratch.  Every fld is matched by an fstp, so the x87 stack depth is
        	; unchanged on return.
        	;
        	; Stack arguments (offsets as seen after the three pushes at .begin):
     76 	;[esp + 28] == autoc[]
     77 	;[esp + 24] == lag
     78 	;[esp + 20] == data_len
     79 	;[esp + 16] == data[]
     80 
        	; lag is limited to 33 because the unrolled block below only contains
        	; groups for coefficients 32..0.
     81 	;ASSERT(lag > 0)
     82 	;ASSERT(lag <= 33)
     83 	;ASSERT(lag <= data_len)
     84 
     85 .begin:
     86 	push	esi
     87 	push	edi
     88 	push	ebx
     89 
     90 	;	for(coeff = 0; coeff < lag; coeff++)
     91 	;		autoc[coeff] = 0.0;
     92 	mov	edi, [esp + 28]			; edi == autoc
     93 	mov	ecx, [esp + 24]			; ecx = # of dwords (=lag) of 0 to write
     94 	xor	eax, eax
     95 	rep	stosd
     96 
     97 	;	const unsigned limit = data_len - lag;
     98 	mov	eax, [esp + 24]			; eax == lag
     99 	mov	ecx, [esp + 20]
    100 	sub	ecx, eax			; ecx == limit
    101 
    102 	mov	edi, [esp + 28]			; edi == autoc
    103 	mov	esi, [esp + 16]			; esi == data
    104 	inc	ecx				; we are looping <= limit so we add one to the counter
    105 
    106 	;	for(sample = 0; sample <= limit; sample++) {
    107 	;		d = data[sample];
    108 	;		for(coeff = 0; coeff < lag; coeff++)
    109 	;			autoc[coeff] += d * data[sample+coeff];
    110 	;	}
    111 	fld	dword [esi]			; ST = d <- data[sample]
        	; Compute edx = address of the group for coefficient lag-1, i.e.
        	; .jumper1_0 minus the total size of the last `lag` groups.
    112 	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
    113 	lea	edx, [eax + eax*2]
    114 	neg	edx
    115 	lea	edx, [eax + edx*4 + .jumper1_0 - .get_eip1]
        	; call/pop yields the run-time address of .get_eip1 in ebx, so the jump
        	; target is built without any absolute address.
    116 	call	.get_eip1
    117 .get_eip1:
    118 	pop	ebx
    119 	add	edx, ebx
        	; The coefficient-0 group addresses [esi] / [edi] with no displacement
        	; byte on its three memory operands, so it is 3 bytes shorter than 11.
    120 	inc	edx				; compensate for the shorter opcode on the last iteration
    121 	inc	edx				; compensate for the shorter opcode on the last iteration
    122 	inc	edx				; compensate for the shorter opcode on the last iteration
        	; The coefficient-32 group is only entered when lag == 33.  Its offset
        	; 32*4 = 128 does not fit a signed byte displacement (see the WATCHOUT
        	; notes below), hence the extra 9 bytes.
    123 	cmp	eax, 33
    124 	jne	.loop1_start
    125 	sub	edx, byte 9			; compensate for the longer opcodes on the first iteration
    126 .loop1_start:
    127 	jmp	edx
    128 
    129 	fld	st0				; ST = d d
    130 	fmul	dword [esi + (32*4)]		; ST = d*data[sample+32] d		WATCHOUT: not a byte displacement here!
    131 	fadd	dword [edi + (32*4)]		; ST = autoc[32]+d*data[sample+32] d	WATCHOUT: not a byte displacement here!
    132 	fstp	dword [edi + (32*4)]		; autoc[32]+=d*data[sample+32]  ST = d	WATCHOUT: not a byte displacement here!
    133 	fld	st0				; ST = d d
    134 	fmul	dword [esi + (31*4)]		; ST = d*data[sample+31] d
    135 	fadd	dword [edi + (31*4)]		; ST = autoc[31]+d*data[sample+31] d
    136 	fstp	dword [edi + (31*4)]		; autoc[31]+=d*data[sample+31]  ST = d
    137 	fld	st0				; ST = d d
    138 	fmul	dword [esi + (30*4)]		; ST = d*data[sample+30] d
    139 	fadd	dword [edi + (30*4)]		; ST = autoc[30]+d*data[sample+30] d
    140 	fstp	dword [edi + (30*4)]		; autoc[30]+=d*data[sample+30]  ST = d
    141 	fld	st0				; ST = d d
    142 	fmul	dword [esi + (29*4)]		; ST = d*data[sample+29] d
    143 	fadd	dword [edi + (29*4)]		; ST = autoc[29]+d*data[sample+29] d
    144 	fstp	dword [edi + (29*4)]		; autoc[29]+=d*data[sample+29]  ST = d
    145 	fld	st0				; ST = d d
    146 	fmul	dword [esi + (28*4)]		; ST = d*data[sample+28] d
    147 	fadd	dword [edi + (28*4)]		; ST = autoc[28]+d*data[sample+28] d
    148 	fstp	dword [edi + (28*4)]		; autoc[28]+=d*data[sample+28]  ST = d
    149 	fld	st0				; ST = d d
    150 	fmul	dword [esi + (27*4)]		; ST = d*data[sample+27] d
    151 	fadd	dword [edi + (27*4)]		; ST = autoc[27]+d*data[sample+27] d
    152 	fstp	dword [edi + (27*4)]		; autoc[27]+=d*data[sample+27]  ST = d
    153 	fld	st0				; ST = d d
    154 	fmul	dword [esi + (26*4)]		; ST = d*data[sample+26] d
    155 	fadd	dword [edi + (26*4)]		; ST = autoc[26]+d*data[sample+26] d
    156 	fstp	dword [edi + (26*4)]		; autoc[26]+=d*data[sample+26]  ST = d
    157 	fld	st0				; ST = d d
    158 	fmul	dword [esi + (25*4)]		; ST = d*data[sample+25] d
    159 	fadd	dword [edi + (25*4)]		; ST = autoc[25]+d*data[sample+25] d
    160 	fstp	dword [edi + (25*4)]		; autoc[25]+=d*data[sample+25]  ST = d
    161 	fld	st0				; ST = d d
    162 	fmul	dword [esi + (24*4)]		; ST = d*data[sample+24] d
    163 	fadd	dword [edi + (24*4)]		; ST = autoc[24]+d*data[sample+24] d
    164 	fstp	dword [edi + (24*4)]		; autoc[24]+=d*data[sample+24]  ST = d
    165 	fld	st0				; ST = d d
    166 	fmul	dword [esi + (23*4)]		; ST = d*data[sample+23] d
    167 	fadd	dword [edi + (23*4)]		; ST = autoc[23]+d*data[sample+23] d
    168 	fstp	dword [edi + (23*4)]		; autoc[23]+=d*data[sample+23]  ST = d
    169 	fld	st0				; ST = d d
    170 	fmul	dword [esi + (22*4)]		; ST = d*data[sample+22] d
    171 	fadd	dword [edi + (22*4)]		; ST = autoc[22]+d*data[sample+22] d
    172 	fstp	dword [edi + (22*4)]		; autoc[22]+=d*data[sample+22]  ST = d
    173 	fld	st0				; ST = d d
    174 	fmul	dword [esi + (21*4)]		; ST = d*data[sample+21] d
    175 	fadd	dword [edi + (21*4)]		; ST = autoc[21]+d*data[sample+21] d
    176 	fstp	dword [edi + (21*4)]		; autoc[21]+=d*data[sample+21]  ST = d
    177 	fld	st0				; ST = d d
    178 	fmul	dword [esi + (20*4)]		; ST = d*data[sample+20] d
    179 	fadd	dword [edi + (20*4)]		; ST = autoc[20]+d*data[sample+20] d
    180 	fstp	dword [edi + (20*4)]		; autoc[20]+=d*data[sample+20]  ST = d
    181 	fld	st0				; ST = d d
    182 	fmul	dword [esi + (19*4)]		; ST = d*data[sample+19] d
    183 	fadd	dword [edi + (19*4)]		; ST = autoc[19]+d*data[sample+19] d
    184 	fstp	dword [edi + (19*4)]		; autoc[19]+=d*data[sample+19]  ST = d
    185 	fld	st0				; ST = d d
    186 	fmul	dword [esi + (18*4)]		; ST = d*data[sample+18] d
    187 	fadd	dword [edi + (18*4)]		; ST = autoc[18]+d*data[sample+18] d
    188 	fstp	dword [edi + (18*4)]		; autoc[18]+=d*data[sample+18]  ST = d
    189 	fld	st0				; ST = d d
    190 	fmul	dword [esi + (17*4)]		; ST = d*data[sample+17] d
    191 	fadd	dword [edi + (17*4)]		; ST = autoc[17]+d*data[sample+17] d
    192 	fstp	dword [edi + (17*4)]		; autoc[17]+=d*data[sample+17]  ST = d
    193 	fld	st0				; ST = d d
    194 	fmul	dword [esi + (16*4)]		; ST = d*data[sample+16] d
    195 	fadd	dword [edi + (16*4)]		; ST = autoc[16]+d*data[sample+16] d
    196 	fstp	dword [edi + (16*4)]		; autoc[16]+=d*data[sample+16]  ST = d
    197 	fld	st0				; ST = d d
    198 	fmul	dword [esi + (15*4)]		; ST = d*data[sample+15] d
    199 	fadd	dword [edi + (15*4)]		; ST = autoc[15]+d*data[sample+15] d
    200 	fstp	dword [edi + (15*4)]		; autoc[15]+=d*data[sample+15]  ST = d
    201 	fld	st0				; ST = d d
    202 	fmul	dword [esi + (14*4)]		; ST = d*data[sample+14] d
    203 	fadd	dword [edi + (14*4)]		; ST = autoc[14]+d*data[sample+14] d
    204 	fstp	dword [edi + (14*4)]		; autoc[14]+=d*data[sample+14]  ST = d
    205 	fld	st0				; ST = d d
    206 	fmul	dword [esi + (13*4)]		; ST = d*data[sample+13] d
    207 	fadd	dword [edi + (13*4)]		; ST = autoc[13]+d*data[sample+13] d
    208 	fstp	dword [edi + (13*4)]		; autoc[13]+=d*data[sample+13]  ST = d
    209 	fld	st0				; ST = d d
    210 	fmul	dword [esi + (12*4)]		; ST = d*data[sample+12] d
    211 	fadd	dword [edi + (12*4)]		; ST = autoc[12]+d*data[sample+12] d
    212 	fstp	dword [edi + (12*4)]		; autoc[12]+=d*data[sample+12]  ST = d
    213 	fld	st0				; ST = d d
    214 	fmul	dword [esi + (11*4)]		; ST = d*data[sample+11] d
    215 	fadd	dword [edi + (11*4)]		; ST = autoc[11]+d*data[sample+11] d
    216 	fstp	dword [edi + (11*4)]		; autoc[11]+=d*data[sample+11]  ST = d
    217 	fld	st0				; ST = d d
    218 	fmul	dword [esi + (10*4)]		; ST = d*data[sample+10] d
    219 	fadd	dword [edi + (10*4)]		; ST = autoc[10]+d*data[sample+10] d
    220 	fstp	dword [edi + (10*4)]		; autoc[10]+=d*data[sample+10]  ST = d
    221 	fld	st0				; ST = d d
    222 	fmul	dword [esi + ( 9*4)]		; ST = d*data[sample+9] d
    223 	fadd	dword [edi + ( 9*4)]		; ST = autoc[9]+d*data[sample+9] d
    224 	fstp	dword [edi + ( 9*4)]		; autoc[9]+=d*data[sample+9]  ST = d
    225 	fld	st0				; ST = d d
    226 	fmul	dword [esi + ( 8*4)]		; ST = d*data[sample+8] d
    227 	fadd	dword [edi + ( 8*4)]		; ST = autoc[8]+d*data[sample+8] d
    228 	fstp	dword [edi + ( 8*4)]		; autoc[8]+=d*data[sample+8]  ST = d
    229 	fld	st0				; ST = d d
    230 	fmul	dword [esi + ( 7*4)]		; ST = d*data[sample+7] d
    231 	fadd	dword [edi + ( 7*4)]		; ST = autoc[7]+d*data[sample+7] d
    232 	fstp	dword [edi + ( 7*4)]		; autoc[7]+=d*data[sample+7]  ST = d
    233 	fld	st0				; ST = d d
    234 	fmul	dword [esi + ( 6*4)]		; ST = d*data[sample+6] d
    235 	fadd	dword [edi + ( 6*4)]		; ST = autoc[6]+d*data[sample+6] d
    236 	fstp	dword [edi + ( 6*4)]		; autoc[6]+=d*data[sample+6]  ST = d
    237 	fld	st0				; ST = d d
    238 	fmul	dword [esi + ( 5*4)]		; ST = d*data[sample+5] d
    239 	fadd	dword [edi + ( 5*4)]		; ST = autoc[5]+d*data[sample+5] d
    240 	fstp	dword [edi + ( 5*4)]		; autoc[5]+=d*data[sample+5]  ST = d
    241 	fld	st0				; ST = d d
    242 	fmul	dword [esi + ( 4*4)]		; ST = d*data[sample+4] d
    243 	fadd	dword [edi + ( 4*4)]		; ST = autoc[4]+d*data[sample+4] d
    244 	fstp	dword [edi + ( 4*4)]		; autoc[4]+=d*data[sample+4]  ST = d
    245 	fld	st0				; ST = d d
    246 	fmul	dword [esi + ( 3*4)]		; ST = d*data[sample+3] d
    247 	fadd	dword [edi + ( 3*4)]		; ST = autoc[3]+d*data[sample+3] d
    248 	fstp	dword [edi + ( 3*4)]		; autoc[3]+=d*data[sample+3]  ST = d
    249 	fld	st0				; ST = d d
    250 	fmul	dword [esi + ( 2*4)]		; ST = d*data[sample+2] d
    251 	fadd	dword [edi + ( 2*4)]		; ST = autoc[2]+d*data[sample+2] d
    252 	fstp	dword [edi + ( 2*4)]		; autoc[2]+=d*data[sample+2]  ST = d
    253 	fld	st0				; ST = d d
    254 	fmul	dword [esi + ( 1*4)]		; ST = d*data[sample+1] d
    255 	fadd	dword [edi + ( 1*4)]		; ST = autoc[1]+d*data[sample+1] d
    256 	fstp	dword [edi + ( 1*4)]		; autoc[1]+=d*data[sample+1]  ST = d
    257 	fld	st0				; ST = d d
    258 	fmul	dword [esi]			; ST = d*data[sample] d			WATCHOUT: no displacement byte here!
    259 	fadd	dword [edi]			; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
    260 	fstp	dword [edi]			; autoc[0]+=d*data[sample]  ST = d	WATCHOUT: no displacement byte here!
    261 .jumper1_0:
    262 
        	; Outer-loop tail for the first loop: edx is left untouched, so every
        	; sample re-enters the unrolled block at the same place (lag groups).
    263 	fstp	st0				; pop d, ST = empty
    264 	add	esi, byte 4			; sample++
    265 	dec	ecx
    266 	jz	.loop1_end
    267 	fld	dword [esi]			; ST = d <- data[sample]
    268 	jmp	edx
    269 .loop1_end:
    270 
    271 	;	for(; sample < data_len; sample++) {
    272 	;		d = data[sample];
    273 	;		for(coeff = 0; coeff < data_len - sample; coeff++)
    274 	;			autoc[coeff] += d * data[sample+coeff];
    275 	;	}
        	; Here esi already points at data[limit + 1]; lag - 1 samples remain and
        	; each one uses one coefficient fewer than the one before it.
    276 	mov	ecx, [esp + 24]			; ecx <- lag
    277 	dec	ecx				; ecx <- lag - 1
    278 	jz	near .end			; skip loop if 0 (i.e. lag == 1)
    279 
    280 	fld	dword [esi]			; ST = d <- data[sample]
    281 	mov	eax, ecx			; eax <- lag - 1 == data_len - sample the first time through
        	; Same entry-address computation as for the first loop, but against the
        	; second unrolled block, which starts at coefficient 31 (at most
        	; lag - 1 == 32 groups are ever needed, so no long-displacement group).
    282 	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
    283 	lea	edx, [eax + eax*2]
    284 	neg	edx
    285 	lea	edx, [eax + edx*4 + .jumper2_0 - .get_eip2]
    286 	call	.get_eip2
    287 .get_eip2:
    288 	pop	ebx
    289 	add	edx, ebx
    290 	inc	edx				; compensate for the shorter opcode on the last iteration
    291 	inc	edx				; compensate for the shorter opcode on the last iteration
    292 	inc	edx				; compensate for the shorter opcode on the last iteration
    293 	jmp	edx
    294 
    295 	fld	st0				; ST = d d
    296 	fmul	dword [esi + (31*4)]		; ST = d*data[sample+31] d
    297 	fadd	dword [edi + (31*4)]		; ST = autoc[31]+d*data[sample+31] d
    298 	fstp	dword [edi + (31*4)]		; autoc[31]+=d*data[sample+31]  ST = d
    299 	fld	st0				; ST = d d
    300 	fmul	dword [esi + (30*4)]		; ST = d*data[sample+30] d
    301 	fadd	dword [edi + (30*4)]		; ST = autoc[30]+d*data[sample+30] d
    302 	fstp	dword [edi + (30*4)]		; autoc[30]+=d*data[sample+30]  ST = d
    303 	fld	st0				; ST = d d
    304 	fmul	dword [esi + (29*4)]		; ST = d*data[sample+29] d
    305 	fadd	dword [edi + (29*4)]		; ST = autoc[29]+d*data[sample+29] d
    306 	fstp	dword [edi + (29*4)]		; autoc[29]+=d*data[sample+29]  ST = d
    307 	fld	st0				; ST = d d
    308 	fmul	dword [esi + (28*4)]		; ST = d*data[sample+28] d
    309 	fadd	dword [edi + (28*4)]		; ST = autoc[28]+d*data[sample+28] d
    310 	fstp	dword [edi + (28*4)]		; autoc[28]+=d*data[sample+28]  ST = d
    311 	fld	st0				; ST = d d
    312 	fmul	dword [esi + (27*4)]		; ST = d*data[sample+27] d
    313 	fadd	dword [edi + (27*4)]		; ST = autoc[27]+d*data[sample+27] d
    314 	fstp	dword [edi + (27*4)]		; autoc[27]+=d*data[sample+27]  ST = d
    315 	fld	st0				; ST = d d
    316 	fmul	dword [esi + (26*4)]		; ST = d*data[sample+26] d
    317 	fadd	dword [edi + (26*4)]		; ST = autoc[26]+d*data[sample+26] d
    318 	fstp	dword [edi + (26*4)]		; autoc[26]+=d*data[sample+26]  ST = d
    319 	fld	st0				; ST = d d
    320 	fmul	dword [esi + (25*4)]		; ST = d*data[sample+25] d
    321 	fadd	dword [edi + (25*4)]		; ST = autoc[25]+d*data[sample+25] d
    322 	fstp	dword [edi + (25*4)]		; autoc[25]+=d*data[sample+25]  ST = d
    323 	fld	st0				; ST = d d
    324 	fmul	dword [esi + (24*4)]		; ST = d*data[sample+24] d
    325 	fadd	dword [edi + (24*4)]		; ST = autoc[24]+d*data[sample+24] d
    326 	fstp	dword [edi + (24*4)]		; autoc[24]+=d*data[sample+24]  ST = d
    327 	fld	st0				; ST = d d
    328 	fmul	dword [esi + (23*4)]		; ST = d*data[sample+23] d
    329 	fadd	dword [edi + (23*4)]		; ST = autoc[23]+d*data[sample+23] d
    330 	fstp	dword [edi + (23*4)]		; autoc[23]+=d*data[sample+23]  ST = d
    331 	fld	st0				; ST = d d
    332 	fmul	dword [esi + (22*4)]		; ST = d*data[sample+22] d
    333 	fadd	dword [edi + (22*4)]		; ST = autoc[22]+d*data[sample+22] d
    334 	fstp	dword [edi + (22*4)]		; autoc[22]+=d*data[sample+22]  ST = d
    335 	fld	st0				; ST = d d
    336 	fmul	dword [esi + (21*4)]		; ST = d*data[sample+21] d
    337 	fadd	dword [edi + (21*4)]		; ST = autoc[21]+d*data[sample+21] d
    338 	fstp	dword [edi + (21*4)]		; autoc[21]+=d*data[sample+21]  ST = d
    339 	fld	st0				; ST = d d
    340 	fmul	dword [esi + (20*4)]		; ST = d*data[sample+20] d
    341 	fadd	dword [edi + (20*4)]		; ST = autoc[20]+d*data[sample+20] d
    342 	fstp	dword [edi + (20*4)]		; autoc[20]+=d*data[sample+20]  ST = d
    343 	fld	st0				; ST = d d
    344 	fmul	dword [esi + (19*4)]		; ST = d*data[sample+19] d
    345 	fadd	dword [edi + (19*4)]		; ST = autoc[19]+d*data[sample+19] d
    346 	fstp	dword [edi + (19*4)]		; autoc[19]+=d*data[sample+19]  ST = d
    347 	fld	st0				; ST = d d
    348 	fmul	dword [esi + (18*4)]		; ST = d*data[sample+18] d
    349 	fadd	dword [edi + (18*4)]		; ST = autoc[18]+d*data[sample+18] d
    350 	fstp	dword [edi + (18*4)]		; autoc[18]+=d*data[sample+18]  ST = d
    351 	fld	st0				; ST = d d
    352 	fmul	dword [esi + (17*4)]		; ST = d*data[sample+17] d
    353 	fadd	dword [edi + (17*4)]		; ST = autoc[17]+d*data[sample+17] d
    354 	fstp	dword [edi + (17*4)]		; autoc[17]+=d*data[sample+17]  ST = d
    355 	fld	st0				; ST = d d
    356 	fmul	dword [esi + (16*4)]		; ST = d*data[sample+16] d
    357 	fadd	dword [edi + (16*4)]		; ST = autoc[16]+d*data[sample+16] d
    358 	fstp	dword [edi + (16*4)]		; autoc[16]+=d*data[sample+16]  ST = d
    359 	fld	st0				; ST = d d
    360 	fmul	dword [esi + (15*4)]		; ST = d*data[sample+15] d
    361 	fadd	dword [edi + (15*4)]		; ST = autoc[15]+d*data[sample+15] d
    362 	fstp	dword [edi + (15*4)]		; autoc[15]+=d*data[sample+15]  ST = d
    363 	fld	st0				; ST = d d
    364 	fmul	dword [esi + (14*4)]		; ST = d*data[sample+14] d
    365 	fadd	dword [edi + (14*4)]		; ST = autoc[14]+d*data[sample+14] d
    366 	fstp	dword [edi + (14*4)]		; autoc[14]+=d*data[sample+14]  ST = d
    367 	fld	st0				; ST = d d
    368 	fmul	dword [esi + (13*4)]		; ST = d*data[sample+13] d
    369 	fadd	dword [edi + (13*4)]		; ST = autoc[13]+d*data[sample+13] d
    370 	fstp	dword [edi + (13*4)]		; autoc[13]+=d*data[sample+13]  ST = d
    371 	fld	st0				; ST = d d
    372 	fmul	dword [esi + (12*4)]		; ST = d*data[sample+12] d
    373 	fadd	dword [edi + (12*4)]		; ST = autoc[12]+d*data[sample+12] d
    374 	fstp	dword [edi + (12*4)]		; autoc[12]+=d*data[sample+12]  ST = d
    375 	fld	st0				; ST = d d
    376 	fmul	dword [esi + (11*4)]		; ST = d*data[sample+11] d
    377 	fadd	dword [edi + (11*4)]		; ST = autoc[11]+d*data[sample+11] d
    378 	fstp	dword [edi + (11*4)]		; autoc[11]+=d*data[sample+11]  ST = d
    379 	fld	st0				; ST = d d
    380 	fmul	dword [esi + (10*4)]		; ST = d*data[sample+10] d
    381 	fadd	dword [edi + (10*4)]		; ST = autoc[10]+d*data[sample+10] d
    382 	fstp	dword [edi + (10*4)]		; autoc[10]+=d*data[sample+10]  ST = d
    383 	fld	st0				; ST = d d
    384 	fmul	dword [esi + ( 9*4)]		; ST = d*data[sample+9] d
    385 	fadd	dword [edi + ( 9*4)]		; ST = autoc[9]+d*data[sample+9] d
    386 	fstp	dword [edi + ( 9*4)]		; autoc[9]+=d*data[sample+9]  ST = d
    387 	fld	st0				; ST = d d
    388 	fmul	dword [esi + ( 8*4)]		; ST = d*data[sample+8] d
    389 	fadd	dword [edi + ( 8*4)]		; ST = autoc[8]+d*data[sample+8] d
    390 	fstp	dword [edi + ( 8*4)]		; autoc[8]+=d*data[sample+8]  ST = d
    391 	fld	st0				; ST = d d
    392 	fmul	dword [esi + ( 7*4)]		; ST = d*data[sample+7] d
    393 	fadd	dword [edi + ( 7*4)]		; ST = autoc[7]+d*data[sample+7] d
    394 	fstp	dword [edi + ( 7*4)]		; autoc[7]+=d*data[sample+7]  ST = d
    395 	fld	st0				; ST = d d
    396 	fmul	dword [esi + ( 6*4)]		; ST = d*data[sample+6] d
    397 	fadd	dword [edi + ( 6*4)]		; ST = autoc[6]+d*data[sample+6] d
    398 	fstp	dword [edi + ( 6*4)]		; autoc[6]+=d*data[sample+6]  ST = d
    399 	fld	st0				; ST = d d
    400 	fmul	dword [esi + ( 5*4)]		; ST = d*data[sample+5] d
    401 	fadd	dword [edi + ( 5*4)]		; ST = autoc[5]+d*data[sample+5] d
    402 	fstp	dword [edi + ( 5*4)]		; autoc[5]+=d*data[sample+5]  ST = d
    403 	fld	st0				; ST = d d
    404 	fmul	dword [esi + ( 4*4)]		; ST = d*data[sample+4] d
    405 	fadd	dword [edi + ( 4*4)]		; ST = autoc[4]+d*data[sample+4] d
    406 	fstp	dword [edi + ( 4*4)]		; autoc[4]+=d*data[sample+4]  ST = d
    407 	fld	st0				; ST = d d
    408 	fmul	dword [esi + ( 3*4)]		; ST = d*data[sample+3] d
    409 	fadd	dword [edi + ( 3*4)]		; ST = autoc[3]+d*data[sample+3] d
    410 	fstp	dword [edi + ( 3*4)]		; autoc[3]+=d*data[sample+3]  ST = d
    411 	fld	st0				; ST = d d
    412 	fmul	dword [esi + ( 2*4)]		; ST = d*data[sample+2] d
    413 	fadd	dword [edi + ( 2*4)]		; ST = autoc[2]+d*data[sample+2] d
    414 	fstp	dword [edi + ( 2*4)]		; autoc[2]+=d*data[sample+2]  ST = d
    415 	fld	st0				; ST = d d
    416 	fmul	dword [esi + ( 1*4)]		; ST = d*data[sample+1] d
    417 	fadd	dword [edi + ( 1*4)]		; ST = autoc[1]+d*data[sample+1] d
    418 	fstp	dword [edi + ( 1*4)]		; autoc[1]+=d*data[sample+1]  ST = d
    419 	fld	st0				; ST = d d
    420 	fmul	dword [esi]			; ST = d*data[sample] d			WATCHOUT: no displacement byte here!
    421 	fadd	dword [edi]			; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
    422 	fstp	dword [edi]			; autoc[0]+=d*data[sample]  ST = d	WATCHOUT: no displacement byte here!
    423 .jumper2_0:
    424 
        	; Outer-loop tail for the second loop: moving edx forward by one
        	; 11-byte group makes the next sample run one coefficient fewer.
    425 	fstp	st0				; pop d, ST = empty
    426 	add	esi, byte 4			; sample++
    427 	dec	ecx
    428 	jz	.loop2_end
    429 	add	edx, byte 11			; adjust our inner loop counter by adjusting the jump target
    430 	fld	dword [esi]			; ST = d <- data[sample]
    431 	jmp	edx
    432 .loop2_end:
    433 
    434 .end:
    435 	pop	ebx
    436 	pop	edi
    437 	pop	esi
    438 	ret
    439 
    440 	ALIGN 16
    441 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
        	; SSE autocorrelation for lag <= 4.
        	;
        	; xmm5 carries four running sums, one per lane.  xmm2 is a sliding
        	; window of the four most recent samples (lane 0 = newest), so each pass
        	; adds data[sample] * data[sample - k] to lane k of xmm5.
        	;
        	; The lag argument is never read: four floats are always stored to
        	; autoc[], so autoc must have room for four.  data[0] is loaded before
        	; data_len is tested, so data_len == 0 is not handled.
        	;
        	; Scratch: eax, edx, xmm0, xmm2, xmm5, EFLAGS.  Nothing is pushed, so the
        	; arguments sit directly above the return address:
    442 	;[esp + 16] == autoc[]
    443 	;[esp + 12] == lag
    444 	;[esp + 8] == data_len
    445 	;[esp + 4] == data[]
    446 
    447 	;ASSERT(lag > 0)
    448 	;ASSERT(lag <= 4)
    449 	;ASSERT(lag <= data_len)
    450 
    451 	;	for(coeff = 0; coeff < lag; coeff++)
    452 	;		autoc[coeff] = 0.0;
    453 	xorps	xmm5, xmm5
    454 
    455 	mov	edx, [esp + 8]			; edx == data_len
    456 	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]
    457 
    458 	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
    459 	add	eax, 4
    460 	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
    461 	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
        	; First sample is handled here, outside the loop, because the window
        	; needs no rotation yet.
    462 .warmup:					; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
    463 	mulps	xmm0, xmm2			; xmm0 = xmm0 * xmm2
    464 	addps	xmm5, xmm0			; xmm5 += xmm0 * xmm2
    465 	dec	edx
    466 	jz	.loop_end
    467 	ALIGN 16
    468 .loop_start:
    469 	; start by reading the next sample
    470 	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
    471 	add	eax, 4
    472 	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]
    473 	shufps	xmm2, xmm2, 93h			; 93h=2-1-0-3 => xmm2 gets rotated left by one float
    474 	movss	xmm2, xmm0			; newest sample replaces lane 0 of the window
    475 	mulps	xmm0, xmm2			; xmm0 = xmm0 * xmm2
    476 	addps	xmm5, xmm0			; xmm5 += xmm0 * xmm2
    477 	dec	edx
    478 	jnz	.loop_start
    479 .loop_end:
    480 	; store autoc
    481 	mov	edx, [esp + 16]			; edx == autoc
    482 	movups	[edx], xmm5			; unaligned store of all four sums
    483 
    484 .end:
    485 	ret
    486 
    487 	ALIGN 16
    488 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
        	; SSE autocorrelation for lag <= 8.
        	;
        	; xmm6:xmm5 carry eight running sums.  xmm3:xmm2 form a sliding window
        	; of the eight most recent samples (lane 0 of xmm2 = newest), so each
        	; pass adds data[sample] * data[sample - k] to sum k.
        	;
        	; The lag argument is never read: eight floats are always stored to
        	; autoc[], so autoc must have room for eight.  data[0] is loaded before
        	; data_len is tested, so data_len == 0 is not handled.
        	;
        	; Scratch: eax, edx, xmm0-xmm3, xmm5, xmm6, EFLAGS.  Nothing is pushed,
        	; so the arguments sit directly above the return address:
    489 	;[esp + 16] == autoc[]
    490 	;[esp + 12] == lag
    491 	;[esp + 8] == data_len
    492 	;[esp + 4] == data[]
    493 
    494 	;ASSERT(lag > 0)
    495 	;ASSERT(lag <= 8)
    496 	;ASSERT(lag <= data_len)
    497 
    498 	;	for(coeff = 0; coeff < lag; coeff++)
    499 	;		autoc[coeff] = 0.0;
    500 	xorps	xmm5, xmm5
    501 	xorps	xmm6, xmm6
    502 
    503 	mov	edx, [esp + 8]			; edx == data_len
    504 	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]
    505 
    506 	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
    507 	add	eax, 4
    508 	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
    509 	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
    510 	movaps	xmm1, xmm0			; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
    511 	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
        	; First sample is handled here, outside the loop, because the window
        	; needs no shifting yet.
    512 .warmup:					; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
    513 	mulps	xmm0, xmm2
    514 	mulps	xmm1, xmm3			; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
    515 	addps	xmm5, xmm0
    516 	addps	xmm6, xmm1			; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
    517 	dec	edx
    518 	jz	.loop_end
    519 	ALIGN 16
    520 .loop_start:
    521 	; start by reading the next sample
    522 	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
    523 	; here we reorder the instructions; see the (#) indexes for a logical order
    524 	shufps	xmm2, xmm2, 93h			; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
    525 	add	eax, 4				; (0)
    526 	shufps	xmm3, xmm3, 93h			; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
    527 	shufps	xmm0, xmm0, 0			; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
    528 	movss	xmm3, xmm2			; (5) sample leaving xmm2 enters lane 0 of xmm3
    529 	movaps	xmm1, xmm0			; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
    530 	movss	xmm2, xmm0			; (6) newest sample enters lane 0 of xmm2
    531 	mulps	xmm1, xmm3			; (8)
    532 	mulps	xmm0, xmm2			; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
    533 	addps	xmm6, xmm1			; (10)
    534 	addps	xmm5, xmm0			; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
    535 	dec	edx
    536 	jnz	.loop_start
    537 .loop_end:
    538 	; store autoc
    539 	mov	edx, [esp + 16]			; edx == autoc
    540 	movups	[edx], xmm5			; sums 0..3
    541 	movups	[edx + 16], xmm6		; sums 4..7
    542 
    543 .end:
    544 	ret
    545 
    546 	ALIGN 16
    547 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
        	; SSE autocorrelation for lag <= 12.
        	;
        	; xmm7:xmm6:xmm5 carry twelve running sums.  xmm4:xmm3:xmm2 form a
        	; sliding window of the twelve most recent samples (lane 0 of xmm2 =
        	; newest), so each pass adds data[sample] * data[sample - k] to sum k.
        	;
        	; The lag argument is never read: twelve floats are always stored to
        	; autoc[], so autoc must have room for twelve.  data[0] is loaded before
        	; data_len is tested, so data_len == 0 is not handled.
        	;
        	; Scratch: eax, edx, xmm0-xmm7, EFLAGS.  Nothing is pushed, so the
        	; arguments sit directly above the return address:
    548 	;[esp + 16] == autoc[]
    549 	;[esp + 12] == lag
    550 	;[esp + 8] == data_len
    551 	;[esp + 4] == data[]
    552 
    553 	;ASSERT(lag > 0)
    554 	;ASSERT(lag <= 12)
    555 	;ASSERT(lag <= data_len)
    556 
    557 	;	for(coeff = 0; coeff < lag; coeff++)
    558 	;		autoc[coeff] = 0.0;
    559 	xorps	xmm5, xmm5
    560 	xorps	xmm6, xmm6
    561 	xorps	xmm7, xmm7
    562 
    563 	mov	edx, [esp + 8]			; edx == data_len
    564 	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]
    565 
    566 	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
    567 	add	eax, 4
    568 	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
    569 	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
    570 	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
    571 	xorps	xmm4, xmm4			; xmm4 = 0,0,0,0
        	; First sample is handled here, outside the loop, because the window
        	; needs no shifting yet.  xmm1 is a scratch copy of the broadcast
        	; sample so xmm0 survives until the last multiply.
    572 .warmup:					; xmm4:xmm3:xmm2 == data[sample-11],data[sample-10],...,data[sample]
    573 	movaps	xmm1, xmm0
    574 	mulps	xmm1, xmm2
    575 	addps	xmm5, xmm1
    576 	movaps	xmm1, xmm0
    577 	mulps	xmm1, xmm3
    578 	addps	xmm6, xmm1
    579 	mulps	xmm0, xmm4
    580 	addps	xmm7, xmm0			; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
    581 	dec	edx
    582 	jz	.loop_end
    583 	ALIGN 16
    584 .loop_start:
    585 	; start by reading the next sample
    586 	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
    587 	add	eax, 4
    588 	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]
    589 
    590 	; shift xmm4:xmm3:xmm2 left by one float
    591 	shufps	xmm2, xmm2, 93h			; 93h=2-1-0-3 => xmm2 gets rotated left by one float
    592 	shufps	xmm3, xmm3, 93h			; 93h=2-1-0-3 => xmm3 gets rotated left by one float
    593 	shufps	xmm4, xmm4, 93h			; 93h=2-1-0-3 => xmm4 gets rotated left by one float
    594 	movss	xmm4, xmm3			; sample leaving xmm3 enters lane 0 of xmm4
    595 	movss	xmm3, xmm2			; sample leaving xmm2 enters lane 0 of xmm3
    596 	movss	xmm2, xmm0			; newest sample enters lane 0 of xmm2
    597 
    598 	; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
    599 	movaps	xmm1, xmm0
    600 	mulps	xmm1, xmm2
    601 	addps	xmm5, xmm1
    602 	movaps	xmm1, xmm0
    603 	mulps	xmm1, xmm3
    604 	addps	xmm6, xmm1
    605 	mulps	xmm0, xmm4
    606 	addps	xmm7, xmm0
    607 
    608 	dec	edx
    609 	jnz	.loop_start
    610 .loop_end:
    611 	; store autoc
    612 	mov	edx, [esp + 16]			; edx == autoc
    613 	movups	[edx], xmm5			; sums 0..3
    614 	movups	[edx + 16], xmm6		; sums 4..7
    615 	movups	[edx + 32], xmm7		; sums 8..11
    616 
    617 .end:
    618 	ret
    619 
    620 	ALIGN 16
    621 cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
        	; 3DNow! autocorrelation.
        	;
        	; Sums are accumulated in a scratch array of dwords carved out of the
        	; stack (called acc[] in the comments below) and copied to autoc[] at
        	; the end.  The work is split in two phases:
        	;   .loop1_*  four samples per pass, two coefficients per inner step;
        	;   .loop2_*  the remaining samples one at a time, one coefficient per
        	;             inner step, clamped so reads stop at data[data_len - 1].
        	;
        	; ebx, esi, edi and ebp are saved and restored; eax, ecx, edx, mm0-mm7
        	; and EFLAGS are scratch.
        	;
        	; Arguments, relative to ebp once the four pushes and `mov ebp, esp`
        	; below have executed:
    622 	;[ebp + 32] autoc
    623 	;[ebp + 28] lag
    624 	;[ebp + 24] data_len
    625 	;[ebp + 20] data
    626 
    627 	push	ebp
    628 	push	ebx
    629 	push	esi
    630 	push	edi
    631 	mov	ebp, esp			; remember esp so the frame can be dropped with one mov
    632 
    633 	mov	esi, [ebp + 20]			; esi = data
    634 	mov	edi, [ebp + 24]			; edi = data_len
    635 	mov	edx, [ebp + 28]			; edx = lag
    636 	inc	edx
    637 	and	edx, byte -2			; edx = lag rounded up to an even count
    638 	mov	eax, edx
    639 	neg	eax
    640 	and	esp, byte -8			; 8-byte align, so movq on [esp + 4*ebx] (ebx even) is aligned
    641 	lea	esp, [esp + 4 * eax]		; reserve edx dwords on the stack: acc[]
    642 	mov	ecx, edx
    643 	xor	eax, eax
    644 .loop0:					; zero acc[edx-1 .. 0]
    645 	dec	ecx
    646 	mov	[esp + 4 * ecx], eax		; mov leaves the flags from dec intact for the jnz
    647 	jnz	short .loop0
    648 
    649 	mov	eax, edi
    650 	sub	eax, edx			; eax = data_len - edx
    651 	mov	ebx, edx
    652 	and	ebx, byte 1			; always 0 here: edx was made even above
    653 	sub	eax, ebx
    654 	lea	ecx, [esi + 4 * eax - 12]	; ecx = &data[data_len - edx - 3], bound for the 4-sample loop
    655 	cmp	esi, ecx
    656 	mov	eax, esi			; eax = &data[sample]; mov keeps the cmp flags for the ja
    657 	ja	short .loop2_pre		; data is already past the bound: skip the 4-sample loop
    658 	ALIGN	16		;4 nops
    659 .loop1_i:				; one pass per four samples data[i .. i+3], eax = &data[i]
    660 	movd	mm0, [eax]
    661 	movd	mm2, [eax + 4]
    662 	movd	mm4, [eax + 8]
    663 	movd	mm6, [eax + 12]
    664 	mov	ebx, edx			; ebx = coefficient index c, stepped down by 2
    665 	punpckldq	mm0, mm0		; mm0 = data[i]   in both halves
    666 	punpckldq	mm2, mm2		; mm2 = data[i+1] in both halves
    667 	punpckldq	mm4, mm4		; mm4 = data[i+2] in both halves
    668 	punpckldq	mm6, mm6		; mm6 = data[i+3] in both halves
    669 	ALIGN	16		;3 nops
    670 .loop1_j:
    671 	sub	ebx, byte 2			; c -= 2; these flags are the ones tested by the jg below
    672 	movd	mm1, [eax + 4 * ebx]
    673 	movd	mm3, [eax + 4 * ebx + 4]
    674 	movd	mm5, [eax + 4 * ebx + 8]
    675 	movd	mm7, [eax + 4 * ebx + 12]
    676 	punpckldq	mm1, mm3		; mm1 = low data[i+c],   high data[i+c+1]
    677 	punpckldq	mm3, mm5		; mm3 = low data[i+c+1], high data[i+c+2]
    678 	pfmul	mm1, mm0			; ... times data[i]
    679 	punpckldq	mm5, mm7		; mm5 = low data[i+c+2], high data[i+c+3]
    680 	pfmul	mm3, mm2			; ... times data[i+1]
    681 	punpckldq	mm7, [eax + 4 * ebx + 16]	; mm7 = low data[i+c+3], high data[i+c+4]
    682 	pfmul	mm5, mm4			; ... times data[i+2]
    683 	pfmul	mm7, mm6			; ... times data[i+3]
    684 	pfadd	mm1, mm3
    685 	movq	mm3, [esp + 4 * ebx]		; mm3 = acc[c] (low), acc[c+1] (high)
    686 	pfadd	mm5, mm7
    687 	pfadd	mm1, mm5			; mm1 = sum of the four products, per half
    688 	pfadd	mm3, mm1
    689 	movq	[esp + 4 * ebx], mm3		; acc[c], acc[c+1] updated
    690 	jg	short .loop1_j			; repeat while c > 0 (flags from the sub at the top)
    691 
    692 	add	eax, byte 16			; next group of four samples
    693 	cmp	eax, ecx
    694 	jb	short .loop1_i
    695 
    696 .loop2_pre:
    697 	mov	ebx, eax			; ebx = &data[sample] where the 4-sample loop stopped
    698 	sub	eax, esi
    699 	shr	eax, 2				; eax = sample, as an element index
    700 	lea	ecx, [esi + 4 * edi]		; ecx = &data[data_len], end marker
    701 	mov	esi, ebx			; esi = &data[sample]
    702 .loop2_i:				; remaining samples, one per pass
    703 	movd	mm0, [esi]			; mm0 = data[sample]
    704 	mov	ebx, edi
    705 	sub	ebx, eax			; ebx = data_len - sample
    706 	cmp	ebx, edx
    707 	jbe	short .loop2_j
    708 	mov	ebx, edx			; ebx = min(data_len - sample, edx) coefficients to do
    709 .loop2_j:
    710 	dec	ebx				; c = ebx; these flags are the ones tested by the jnz below
    711 	movd	mm1, [esi + 4 * ebx]		; mm1 = data[sample + c]
    712 	pfmul	mm1, mm0
    713 	movd	mm2, [esp + 4 * ebx]
    714 	pfadd	mm1, mm2
    715 	movd	[esp + 4 * ebx], mm1		; acc[c] += data[sample] * data[sample + c]
    716 
    717 	jnz	short .loop2_j
    718 
    719 	add	esi, byte 4			; sample++
    720 	inc	eax
    721 	cmp	esi, ecx
    722 	jnz	short .loop2_i
    723 
    724 	mov	edi, [ebp + 32]			; edi = autoc
    725 	mov	edx, [ebp + 28]			; edx = lag as passed in, not the rounded-up count
    726 .loop3:					; copy acc[lag-1 .. 0] to autoc[]
    727 	dec	edx
    728 	mov	eax, [esp + 4 * edx]
    729 	mov	[edi + 4 * edx], eax
    730 	jnz	short .loop3
    731 
    732 	femms					; clear the MMX/3DNow! state before returning
    733 
    734 	mov	esp, ebp			; drop acc[] and the alignment padding in one step
    735 	pop	edi
    736 	pop	esi
    737 	pop	ebx
    738 	pop	ebp
    739 	ret
    740 
    741 ;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
    742 ;
    743 ;	for(i = 0; i < data_len; i++) {
    744 ;		sum = 0;
    745 ;		for(j = 0; j < order; j++)
    746 ;			sum += qlp_coeff[j] * data[i-j-1];
    747 ;		residual[i] = data[i] - (sum >> lp_quantization);
    748 ;	}
    749 ;
        ; Implementation notes:
        ;  - Arguments are read from the stack; the [esp + N] offsets listed below
        ;    are the ones in effect after the four register pushes of the prologue.
        ;  - Register roles: esi = data cursor, ebx = samples left, edi = residual
        ;    cursor (or residual - data in the unrolled path), ebp = running sum
        ;    (orders > 1).
        ;  - Three code paths, selected by order:
        ;      order <= 1  : scalar loop that carries data[i-1] in eax
        ;      order > 32  : generic nested loop
        ;      otherwise   : computed jump into an unrolled chain of 32
        ;                    multiply-accumulate groups
        ;  - data[] is read at negative indices (data[i-j-1] starting with i = 0).
        ;
    750 	ALIGN	16
    751 cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
    752 	;[esp + 40]	residual[]
    753 	;[esp + 36]	lp_quantization
    754 	;[esp + 32]	order
    755 	;[esp + 28]	qlp_coeff[]
    756 	;[esp + 24]	data_len
    757 	;[esp + 20]	data[]
    758 
    759 	;ASSERT(order > 0)
    760 
    761 	push	ebp
    762 	push	ebx
    763 	push	esi
    764 	push	edi
    765 
    766 	mov	esi, [esp + 20]			; esi = data[]
    767 	mov	edi, [esp + 40]			; edi = residual[]
    768 	mov	eax, [esp + 32]			; eax = order
    769 	mov	ebx, [esp + 24]			; ebx = data_len
    770 
    771 	test	ebx, ebx
    772 	jz	near .end			; do nothing if data_len == 0
        	; .begin is also jumped to by the _mmx variant of this routine, which
        	; arrives with esi, edi, eax = order and ebx = samples left already set.
    773 .begin:
    774 	cmp	eax, byte 1
    775 	jg	short .i_1more			; order > 1 (signed compare)
    776 
        	; --- single-coefficient path ---
    777 	mov	ecx, [esp + 28]
    778 	mov	edx, [ecx]			; edx = qlp_coeff[0]
    779 	mov	eax, [esi - 4]			; eax = data[-1]
    780 	mov	cl, [esp + 36]			; cl = lp_quantization
    781 	ALIGN	16
    782 .i_1_loop_i:
    783 	imul	eax, edx			; eax = qlp_coeff[0] * data[i-1]
    784 	sar	eax, cl				; eax = sum >> lp_quantization
    785 	neg	eax
    786 	add	eax, [esi]			; eax = data[i] - (sum >> lp_quantization)
    787 	mov	[edi], eax			; residual[i] = eax
    788 	mov	eax, [esi]			; keep data[i] as the next iteration's data[i-1]
    789 	add	edi, byte 4
    790 	add	esi, byte 4
    791 	dec	ebx
    792 	jnz	.i_1_loop_i
    793 
    794 	jmp	.end
    795 
    796 .i_1more:
    797 	cmp	eax, byte 32			; for order <= 32 there is a faster routine
    798 	jbe	short .i_32
    799 
    800 	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
    801 	ALIGN 16
    802 .i_32more_loop_i:
    803 	xor	ebp, ebp			; sum = 0
    804 	mov	ecx, [esp + 32]			; ecx = order
    805 	mov	edx, ecx
    806 	shl	edx, 2
    807 	add	edx, [esp + 28]			; edx = &qlp_coeff[order]
    808 	neg	ecx				; ecx = -order, counts up to 0
    809 	ALIGN	16
    810 .i_32more_loop_j:
    811 	sub	edx, byte 4			; step back to the previous coefficient
    812 	mov	eax, [edx]
    813 	imul	eax, [esi + 4 * ecx]		; times data[i + ecx] (ecx is negative)
    814 	add	ebp, eax			; sum += product
    815 	inc	ecx
    816 	jnz	short .i_32more_loop_j
    817 
    818 	mov	cl, [esp + 36]			; reload lp_quantization (ecx was the loop counter)
    819 	sar	ebp, cl
    820 	neg	ebp
    821 	add	ebp, [esi]			; ebp = data[i] - (sum >> lp_quantization)
    822 	mov	[edi], ebp			; residual[i] = ebp
    823 	add	esi, byte 4
    824 	add	edi, byte 4
    825 
    826 	dec	ebx
    827 	jnz	.i_32more_loop_i
    828 
    829 	jmp	.end
    830 
        	; --- unrolled path ---
        	; Enter the chain below at .jumper_0 - 9 * order + 1 so that exactly
        	; `order` multiply-accumulate groups execute before .jumper_0.  The
        	; factor 9 is the byte length each group is expected to assemble to;
        	; the +1 accounts for the final group, whose first instruction has no
        	; displacement byte.
    831 .i_32:
    832 	sub	edi, esi			; edi = residual - data, so [edi + esi] is residual[i]
    833 	neg	eax				; eax = -order
    834 	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]	; edx = -9 * order + (.jumper_0 - .get_eip0)
    835 	call	.get_eip0			; pushes the address of .get_eip0 ...
    836 .get_eip0:
    837 	pop	eax				; ... which is popped here: eax = run-time address of .get_eip0
    838 	add	edx, eax			; edx = .jumper_0 - 9 * order
    839 	inc	edx				; compensate for the shorter opcode in the last group
    840 	mov	eax, [esp + 28]			; eax = qlp_coeff[]
    841 	xor	ebp, ebp			; sum = 0
    842 	jmp	edx
    843 
    844 	mov	ecx, [eax + 124]		; ecx =  qlp_coeff[31]
    845 	imul	ecx, [esi - 128]		; ecx =  qlp_coeff[31] * data[i-32]
    846 	add	ebp, ecx			; sum += qlp_coeff[31] * data[i-32]
    847 	mov	ecx, [eax + 120]		; ecx =  qlp_coeff[30]
    848 	imul	ecx, [esi - 124]		; ecx =  qlp_coeff[30] * data[i-31]
    849 	add	ebp, ecx			; sum += qlp_coeff[30] * data[i-31]
    850 	mov	ecx, [eax + 116]		; ecx =  qlp_coeff[29]
    851 	imul	ecx, [esi - 120]		; ecx =  qlp_coeff[29] * data[i-30]
    852 	add	ebp, ecx			; sum += qlp_coeff[29] * data[i-30]
    853 	mov	ecx, [eax + 112]		; ecx =  qlp_coeff[28]
    854 	imul	ecx, [esi - 116]		; ecx =  qlp_coeff[28] * data[i-29]
    855 	add	ebp, ecx			; sum += qlp_coeff[28] * data[i-29]
    856 	mov	ecx, [eax + 108]		; ecx =  qlp_coeff[27]
    857 	imul	ecx, [esi - 112]		; ecx =  qlp_coeff[27] * data[i-28]
    858 	add	ebp, ecx			; sum += qlp_coeff[27] * data[i-28]
    859 	mov	ecx, [eax + 104]		; ecx =  qlp_coeff[26]
    860 	imul	ecx, [esi - 108]		; ecx =  qlp_coeff[26] * data[i-27]
    861 	add	ebp, ecx			; sum += qlp_coeff[26] * data[i-27]
    862 	mov	ecx, [eax + 100]		; ecx =  qlp_coeff[25]
    863 	imul	ecx, [esi - 104]		; ecx =  qlp_coeff[25] * data[i-26]
    864 	add	ebp, ecx			; sum += qlp_coeff[25] * data[i-26]
    865 	mov	ecx, [eax + 96]			; ecx =  qlp_coeff[24]
    866 	imul	ecx, [esi - 100]		; ecx =  qlp_coeff[24] * data[i-25]
    867 	add	ebp, ecx			; sum += qlp_coeff[24] * data[i-25]
    868 	mov	ecx, [eax + 92]			; ecx =  qlp_coeff[23]
    869 	imul	ecx, [esi - 96]			; ecx =  qlp_coeff[23] * data[i-24]
    870 	add	ebp, ecx			; sum += qlp_coeff[23] * data[i-24]
    871 	mov	ecx, [eax + 88]			; ecx =  qlp_coeff[22]
    872 	imul	ecx, [esi - 92]			; ecx =  qlp_coeff[22] * data[i-23]
    873 	add	ebp, ecx			; sum += qlp_coeff[22] * data[i-23]
    874 	mov	ecx, [eax + 84]			; ecx =  qlp_coeff[21]
    875 	imul	ecx, [esi - 88]			; ecx =  qlp_coeff[21] * data[i-22]
    876 	add	ebp, ecx			; sum += qlp_coeff[21] * data[i-22]
    877 	mov	ecx, [eax + 80]			; ecx =  qlp_coeff[20]
    878 	imul	ecx, [esi - 84]			; ecx =  qlp_coeff[20] * data[i-21]
    879 	add	ebp, ecx			; sum += qlp_coeff[20] * data[i-21]
    880 	mov	ecx, [eax + 76]			; ecx =  qlp_coeff[19]
    881 	imul	ecx, [esi - 80]			; ecx =  qlp_coeff[19] * data[i-20]
    882 	add	ebp, ecx			; sum += qlp_coeff[19] * data[i-20]
    883 	mov	ecx, [eax + 72]			; ecx =  qlp_coeff[18]
    884 	imul	ecx, [esi - 76]			; ecx =  qlp_coeff[18] * data[i-19]
    885 	add	ebp, ecx			; sum += qlp_coeff[18] * data[i-19]
    886 	mov	ecx, [eax + 68]			; ecx =  qlp_coeff[17]
    887 	imul	ecx, [esi - 72]			; ecx =  qlp_coeff[17] * data[i-18]
    888 	add	ebp, ecx			; sum += qlp_coeff[17] * data[i-18]
    889 	mov	ecx, [eax + 64]			; ecx =  qlp_coeff[16]
    890 	imul	ecx, [esi - 68]			; ecx =  qlp_coeff[16] * data[i-17]
    891 	add	ebp, ecx			; sum += qlp_coeff[16] * data[i-17]
    892 	mov	ecx, [eax + 60]			; ecx =  qlp_coeff[15]
    893 	imul	ecx, [esi - 64]			; ecx =  qlp_coeff[15] * data[i-16]
    894 	add	ebp, ecx			; sum += qlp_coeff[15] * data[i-16]
    895 	mov	ecx, [eax + 56]			; ecx =  qlp_coeff[14]
    896 	imul	ecx, [esi - 60]			; ecx =  qlp_coeff[14] * data[i-15]
    897 	add	ebp, ecx			; sum += qlp_coeff[14] * data[i-15]
    898 	mov	ecx, [eax + 52]			; ecx =  qlp_coeff[13]
    899 	imul	ecx, [esi - 56]			; ecx =  qlp_coeff[13] * data[i-14]
    900 	add	ebp, ecx			; sum += qlp_coeff[13] * data[i-14]
    901 	mov	ecx, [eax + 48]			; ecx =  qlp_coeff[12]
    902 	imul	ecx, [esi - 52]			; ecx =  qlp_coeff[12] * data[i-13]
    903 	add	ebp, ecx			; sum += qlp_coeff[12] * data[i-13]
    904 	mov	ecx, [eax + 44]			; ecx =  qlp_coeff[11]
    905 	imul	ecx, [esi - 48]			; ecx =  qlp_coeff[11] * data[i-12]
    906 	add	ebp, ecx			; sum += qlp_coeff[11] * data[i-12]
    907 	mov	ecx, [eax + 40]			; ecx =  qlp_coeff[10]
    908 	imul	ecx, [esi - 44]			; ecx =  qlp_coeff[10] * data[i-11]
    909 	add	ebp, ecx			; sum += qlp_coeff[10] * data[i-11]
    910 	mov	ecx, [eax + 36]			; ecx =  qlp_coeff[ 9]
    911 	imul	ecx, [esi - 40]			; ecx =  qlp_coeff[ 9] * data[i-10]
    912 	add	ebp, ecx			; sum += qlp_coeff[ 9] * data[i-10]
    913 	mov	ecx, [eax + 32]			; ecx =  qlp_coeff[ 8]
    914 	imul	ecx, [esi - 36]			; ecx =  qlp_coeff[ 8] * data[i- 9]
    915 	add	ebp, ecx			; sum += qlp_coeff[ 8] * data[i- 9]
    916 	mov	ecx, [eax + 28]			; ecx =  qlp_coeff[ 7]
    917 	imul	ecx, [esi - 32]			; ecx =  qlp_coeff[ 7] * data[i- 8]
    918 	add	ebp, ecx			; sum += qlp_coeff[ 7] * data[i- 8]
    919 	mov	ecx, [eax + 24]			; ecx =  qlp_coeff[ 6]
    920 	imul	ecx, [esi - 28]			; ecx =  qlp_coeff[ 6] * data[i- 7]
    921 	add	ebp, ecx			; sum += qlp_coeff[ 6] * data[i- 7]
    922 	mov	ecx, [eax + 20]			; ecx =  qlp_coeff[ 5]
    923 	imul	ecx, [esi - 24]			; ecx =  qlp_coeff[ 5] * data[i- 6]
    924 	add	ebp, ecx			; sum += qlp_coeff[ 5] * data[i- 6]
    925 	mov	ecx, [eax + 16]			; ecx =  qlp_coeff[ 4]
    926 	imul	ecx, [esi - 20]			; ecx =  qlp_coeff[ 4] * data[i- 5]
    927 	add	ebp, ecx			; sum += qlp_coeff[ 4] * data[i- 5]
    928 	mov	ecx, [eax + 12]			; ecx =  qlp_coeff[ 3]
    929 	imul	ecx, [esi - 16]			; ecx =  qlp_coeff[ 3] * data[i- 4]
    930 	add	ebp, ecx			; sum += qlp_coeff[ 3] * data[i- 4]
    931 	mov	ecx, [eax + 8]			; ecx =  qlp_coeff[ 2]
    932 	imul	ecx, [esi - 12]			; ecx =  qlp_coeff[ 2] * data[i- 3]
    933 	add	ebp, ecx			; sum += qlp_coeff[ 2] * data[i- 3]
    934 	mov	ecx, [eax + 4]			; ecx =  qlp_coeff[ 1]
    935 	imul	ecx, [esi - 8]			; ecx =  qlp_coeff[ 1] * data[i- 2]
    936 	add	ebp, ecx			; sum += qlp_coeff[ 1] * data[i- 2]
    937 	mov	ecx, [eax]			; ecx =  qlp_coeff[ 0] (NOTE: no displacement, so this instruction is one byte shorter)
    938 	imul	ecx, [esi - 4]			; ecx =  qlp_coeff[ 0] * data[i- 1]
    939 	add	ebp, ecx			; sum += qlp_coeff[ 0] * data[i- 1]
    940 .jumper_0:
    941 
    942 	mov	cl, [esp + 36]			; cl = lp_quantization (ecx was used as scratch above)
    943 	sar	ebp, cl				; ebp = (sum >> lp_quantization)
    944 	neg	ebp
    945 	add	ebp, [esi]			; ebp = data[i] - (sum >> lp_quantization)
    946 	mov	[edi + esi], ebp		; residual[i] = ebp (edi holds residual - data)
    947 	add	esi, byte 4
    948 
    949 	dec	ebx
    950 	jz	short .end
    951 	xor	ebp, ebp			; sum = 0 for the next sample
    952 	jmp	edx				; re-enter the chain at the same entry point
    953 
    954 .end:
    955 	pop	edi
    956 	pop	esi
    957 	pop	ebx
    958 	pop	ebp
    959 	ret
    960 
    961 ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
    962 ; the channel and qlp_coeffs must be <= 16.  Especially note that this routine
    963 ; cannot be used for side-channel coded 16bps channels since the effective bps
    964 ; is 17.
        ;
        ; Implementation notes:
        ;  - The low 16 bits of every coefficient are pushed onto an 8-byte aligned
        ;    scratch area on the stack and zero-padded to a multiple of four words;
        ;    qlp_coeff[0] ends up at the highest address.
        ;  - mm4 holds a sliding window of the four most recent samples as words,
        ;    mm5 the words of qlp_coeff[3..0], mm6 the shift count lp_quantization.
        ;  - Two residuals are produced per main-loop iteration.  ebx starts at
        ;    data_len - 1, so an odd trailing sample (or a lone single sample) is
        ;    left over and handled by jumping into the scalar routine's .begin.
        ;  - data[] is read at negative indices: data[-4..-1] unconditionally, and
        ;    back to data[-(order rounded up to a multiple of 4)] for larger orders.
    965 	ALIGN	16
    966 cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
    967 	;[esp + 40]	residual[]
    968 	;[esp + 36]	lp_quantization
    969 	;[esp + 32]	order
    970 	;[esp + 28]	qlp_coeff[]
    971 	;[esp + 24]	data_len
    972 	;[esp + 20]	data[]
    973 
    974 	;ASSERT(order > 0)
    975 
    976 	push	ebp
    977 	push	ebx
    978 	push	esi
    979 	push	edi
    980 
    981 	mov	esi, [esp + 20]			; esi = data[]
    982 	mov	edi, [esp + 40]			; edi = residual[]
    983 	mov	eax, [esp + 32]			; eax = order
    984 	mov	ebx, [esp + 24]			; ebx = data_len
    985 
    986 	test	ebx, ebx
    987 	jz	near .end			; do nothing if data_len == 0
    988 	dec	ebx				; ebx = data_len - 1
    989 	test	ebx, ebx
    990 	jz	near .last_one			; a single sample goes straight to the scalar routine
    991 
    992 	mov	edx, [esp + 28]			; edx = qlp_coeff[]
    993 	movd	mm6, [esp + 36]			; mm6 = 0:lp_quantization
    994 	mov	ebp, esp			; ebp = esp to restore at .mmx_end; stack args are not addressed via esp until then
    995 
    996 	and	esp, 0xfffffff8			; align the scratch area to 8 bytes
    997 
    998 	xor	ecx, ecx
    999 .copy_qlp_loop:
   1000 	push	word [edx + 4 * ecx]		; push the low 16 bits of qlp_coeff[ecx]
   1001 	inc	ecx
   1002 	cmp	ecx, eax
   1003 	jnz	short .copy_qlp_loop
   1004 
   1005 	and	ecx, 0x3			; ecx = order mod 4
   1006 	test	ecx, ecx
   1007 	je	short .za_end			; already a multiple of four words
   1008 	sub	ecx, byte 4			; ecx = -(number of padding words needed)
   1009 .za_loop:
   1010 	push	word 0				; zero coefficient as padding
   1011 	inc	eax				; eax becomes the padded order
   1012 	inc	ecx
   1013 	jnz	short .za_loop
   1014 .za_end:
   1015 
   1016 	movq	mm5, [esp + 2 * eax - 8]	; mm5 = first four words pushed: qlp_coeff[3..0], word 3 = qlp_coeff[0]
   1017 	movd	mm4, [esi - 16]
   1018 	punpckldq	mm4, [esi - 12]		; mm4 = data[-4] : data[-3]
   1019 	movd	mm0, [esi - 8]
   1020 	punpckldq	mm0, [esi - 4]		; mm0 = data[-2] : data[-1]
   1021 	packssdw	mm4, mm0		; mm4 = data[-4..-1] as four signed-saturated words
   1022 
   1023 	cmp	eax, byte 4
   1024 	jnbe	short .mmx_4more		; padded order > 4
   1025 
        	; --- padded order == 4: one pmaddwd per sample is enough ---
   1026 	ALIGN	16
   1027 .mmx_4_loop_i:
   1028 	movd	mm1, [esi]
   1029 	movq	mm3, mm4			; mm3 = window data[i-4..i-1] (for sample i)
   1030 	punpckldq	mm1, [esi + 4]		; mm1 = data[i] : data[i+1]
   1031 	psrlq	mm4, 16				; drop the oldest word of the window
   1032 	movq	mm0, mm1
   1033 	psllq	mm0, 48				; low 16 bits of data[i] moved to the top word
   1034 	por	mm4, mm0			; mm4 = window data[i-3..i]
   1035 	movq	mm2, mm4			; mm2 = window for sample i+1
   1036 	psrlq	mm4, 16
   1037 	pxor	mm0, mm0
   1038 	punpckhdq	mm0, mm1		; mm0 = 0 : data[i+1]
   1039 	pmaddwd	mm3, mm5			; two partial sums for sample i
   1040 	pmaddwd	mm2, mm5			; two partial sums for sample i+1
   1041 	psllq	mm0, 16				; low 16 bits of data[i+1] moved to the top word
   1042 	por	mm4, mm0			; mm4 = window data[i-2..i+1] for the next iteration
   1043 	movq	mm0, mm3
   1044 	punpckldq	mm3, mm2		; mm3 = low partial sums of both samples
   1045 	punpckhdq	mm0, mm2		; mm0 = high partial sums of both samples
   1046 	paddd	mm3, mm0			; mm3 = sum(i) : sum(i+1)
   1047 	psrad	mm3, mm6			; >> lp_quantization
   1048 	psubd	mm1, mm3			; mm1 = residual[i] : residual[i+1]
   1049 	movd	[edi], mm1
   1050 	punpckhdq	mm1, mm1
   1051 	movd	[edi + 4], mm1
   1052 
   1053 	add	edi, byte 8
   1054 	add	esi, byte 8
   1055 
   1056 	sub	ebx, 2
   1057 	jg	.mmx_4_loop_i			; loop exits with ebx == 0 (one sample left) or ebx == -1 (none)
   1058 	jmp	.mmx_end
   1059 
        	; --- padded order > 4: extra inner loop over the older coefficient groups ---
   1060 .mmx_4more:
   1061 	shl	eax, 2
   1062 	neg	eax
   1063 	add	eax, byte 16			; eax = 16 - 4 * padded order (negative byte offset)
   1064 
   1065 	ALIGN	16
   1066 .mmx_4more_loop_i:
   1067 	movd	mm1, [esi]
   1068 	punpckldq	mm1, [esi + 4]		; mm1 = data[i] : data[i+1]
   1069 	movq	mm3, mm4			; mm3 = window for sample i
   1070 	psrlq	mm4, 16
   1071 	movq	mm0, mm1
   1072 	psllq	mm0, 48
   1073 	por	mm4, mm0
   1074 	movq	mm2, mm4			; mm2 = window for sample i+1
   1075 	psrlq	mm4, 16
   1076 	pxor	mm0, mm0
   1077 	punpckhdq	mm0, mm1
   1078 	pmaddwd	mm3, mm5			; partial sums from qlp_coeff[0..3], sample i
   1079 	pmaddwd	mm2, mm5			; partial sums from qlp_coeff[0..3], sample i+1
   1080 	psllq	mm0, 16
   1081 	por	mm4, mm0			; mm4 = window for the next iteration
   1082 
   1083 	mov	ecx, esi
   1084 	add	ecx, eax			; ecx = data cursor; first inner iteration reads from data[i - padded order]
   1085 	mov	edx, esp			; edx = lowest-addressed words = highest-index (incl. padding) coefficients
   1086 
   1087 	ALIGN	16
   1088 .mmx_4more_loop_j:
   1089 	movd	mm0, [ecx - 16]
   1090 	movd	mm7, [ecx - 8]
   1091 	punpckldq	mm0, [ecx - 12]
   1092 	punpckldq	mm7, [ecx - 4]
   1093 	packssdw	mm0, mm7		; four consecutive samples as words (for sample i)
   1094 	pmaddwd	mm0, [edx]
   1095 	punpckhdq	mm7, mm7		; keep the sample from [ecx - 4] in the low dword
   1096 	paddd	mm3, mm0			; accumulate for sample i
   1097 	movd	mm0, [ecx - 12]
   1098 	punpckldq	mm0, [ecx - 8]
   1099 	punpckldq	mm7, [ecx]
   1100 	packssdw	mm0, mm7		; same four-sample group shifted forward by one (for sample i+1)
   1101 	pmaddwd	mm0, [edx]
   1102 	paddd	mm2, mm0			; accumulate for sample i+1
   1103 
   1104 	add	edx, byte 8			; next four coefficient words
   1105 	add	ecx, byte 16			; next four samples
   1106 	cmp	ecx, esi
   1107 	jnz	.mmx_4more_loop_j
   1108 
   1109 	movq	mm0, mm3
   1110 	punpckldq	mm3, mm2
   1111 	punpckhdq	mm0, mm2
   1112 	paddd	mm3, mm0			; mm3 = sum(i) : sum(i+1)
   1113 	psrad	mm3, mm6			; >> lp_quantization
   1114 	psubd	mm1, mm3			; mm1 = residual[i] : residual[i+1]
   1115 	movd	[edi], mm1
   1116 	punpckhdq	mm1, mm1
   1117 	movd	[edi + 4], mm1
   1118 
   1119 	add	edi, byte 8
   1120 	add	esi, byte 8
   1121 
   1122 	sub	ebx, 2
   1123 	jg	near .mmx_4more_loop_i
   1124 
   1125 .mmx_end:
   1126 	emms					; leave MMX state before returning to integer code
   1127 	mov	esp, ebp			; discard the coefficient scratch area
   1128 .last_one:
   1129 	mov	eax, [esp + 32]			; eax = order, as the scalar routine expects
   1130 	inc	ebx				; ebx = samples still to do (1 or 0)
   1131 	jnz	near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin
   1132 
   1133 .end:
   1134 	pop	edi
   1135 	pop	esi
   1136 	pop	ebx
   1137 	pop	ebp
   1138 	ret
   1139 
   1140 ; **********************************************************************
   1141 ;
   1142 ; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
   1143 ; {
   1144 ; 	unsigned i, j;
   1145 ; 	FLAC__int32 sum;
   1146 ;
   1147 ; 	FLAC__ASSERT(order > 0);
   1148 ;
   1149 ; 	for(i = 0; i < data_len; i++) {
   1150 ; 		sum = 0;
   1151 ; 		for(j = 0; j < order; j++)
   1152 ; 			sum += qlp_coeff[j] * data[i-j-1];
   1153 ; 		data[i] = residual[i] + (sum >> lp_quantization);
   1154 ; 	}
   1155 ; }
        ;
        ; Implementation notes:
        ;  - Arguments are read from the stack; the [esp + N] offsets listed below
        ;    are the ones in effect after the four register pushes of the prologue.
        ;  - Register roles: edi = data cursor, ebx = samples left, esi = residual
        ;    cursor (or residual - data in the unrolled path), ebp = running sum
        ;    (orders > 1).
        ;  - Three code paths, selected by order: order <= 1 (scalar loop carrying
        ;    the previous output in eax), order > 32 (generic nested loop), and
        ;    otherwise a computed jump into an unrolled multiply-accumulate chain.
        ;  - data[] is read at negative indices (data[i-j-1] starting with i = 0).
        ;  - NOTE: despite the .x87_ prefix on the local labels, only integer
        ;    instructions are used in this routine.
   1156 	ALIGN	16
   1157 cident FLAC__lpc_restore_signal_asm_ia32
   1158 	;[esp + 40]	data[]
   1159 	;[esp + 36]	lp_quantization
   1160 	;[esp + 32]	order
   1161 	;[esp + 28]	qlp_coeff[]
   1162 	;[esp + 24]	data_len
   1163 	;[esp + 20]	residual[]
   1164 
   1165 	;ASSERT(order > 0)
   1166 
   1167 	push	ebp
   1168 	push	ebx
   1169 	push	esi
   1170 	push	edi
   1171 
   1172 	mov	esi, [esp + 20]			; esi = residual[]
   1173 	mov	edi, [esp + 40]			; edi = data[]
   1174 	mov	eax, [esp + 32]			; eax = order
   1175 	mov	ebx, [esp + 24]			; ebx = data_len
   1176 
   1177 	test	ebx, ebx
   1178 	jz	near .end			; do nothing if data_len == 0
   1179 
        	; .begin is also jumped to by the _mmx variant of this routine, which
        	; arrives with esi, edi, eax = order and ebx = data_len already set.
   1180 .begin:
   1181 	cmp	eax, byte 1
   1182 	jg	short .x87_1more		; order > 1 (signed compare)
   1183 
        	; --- single-coefficient path ---
   1184 	mov	ecx, [esp + 28]
   1185 	mov	edx, [ecx]			; edx = qlp_coeff[0]
   1186 	mov	eax, [edi - 4]			; eax = data[-1]
   1187 	mov	cl, [esp + 36]			; cl = lp_quantization
   1188 	ALIGN	16
   1189 .x87_1_loop_i:
   1190 	imul	eax, edx			; eax = qlp_coeff[0] * data[i-1]
   1191 	sar	eax, cl				; eax = sum >> lp_quantization
   1192 	add	eax, [esi]			; eax = residual[i] + (sum >> lp_quantization)
   1193 	mov	[edi], eax			; data[i] = eax; eax stays live as the next data[i-1]
   1194 	add	esi, byte 4
   1195 	add	edi, byte 4
   1196 	dec	ebx
   1197 	jnz	.x87_1_loop_i
   1198 
   1199 	jmp	.end
   1200 
   1201 .x87_1more:
   1202 	cmp	eax, byte 32			; for order <= 32 there is a faster routine
   1203 	jbe	short .x87_32
   1204 
   1205 	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
   1206 	ALIGN 16
   1207 .x87_32more_loop_i:
   1208 	xor	ebp, ebp			; sum = 0
   1209 	mov	ecx, [esp + 32]			; ecx = order
   1210 	mov	edx, ecx
   1211 	shl	edx, 2
   1212 	add	edx, [esp + 28]			; edx = &qlp_coeff[order]
   1213 	neg	ecx				; ecx = -order, counts up to 0
   1214 	ALIGN	16
   1215 .x87_32more_loop_j:
   1216 	sub	edx, byte 4			; step back to the previous coefficient
   1217 	mov	eax, [edx]
   1218 	imul	eax, [edi + 4 * ecx]		; times data[i + ecx] (ecx is negative)
   1219 	add	ebp, eax			; sum += product
   1220 	inc	ecx
   1221 	jnz	short .x87_32more_loop_j
   1222 
   1223 	mov	cl, [esp + 36]			; reload lp_quantization (ecx was the loop counter)
   1224 	sar	ebp, cl
   1225 	add	ebp, [esi]			; ebp = residual[i] + (sum >> lp_quantization)
   1226 	mov	[edi], ebp			; data[i] = ebp
   1227 	add	edi, byte 4
   1228 	add	esi, byte 4
   1229 
   1230 	dec	ebx
   1231 	jnz	.x87_32more_loop_i
   1232 
   1233 	jmp	.end
   1234 
        	; --- unrolled path ---
        	; Enter the chain below at .jumper_0 - 9 * order + 1 so that exactly
        	; `order` multiply-accumulate groups execute before .jumper_0.  The
        	; factor 9 is the byte length each group is expected to assemble to.
   1235 .x87_32:
   1236 	sub	esi, edi			; esi = residual - data, so [esi + edi] is residual[i]
   1237 	neg	eax				; eax = -order
   1238 	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]	; edx = -9 * order + (.jumper_0 - .get_eip0)
   1239 	call	.get_eip0			; pushes the address of .get_eip0 ...
   1240 .get_eip0:
   1241 	pop	eax				; ... which is popped here: eax = run-time address of .get_eip0
   1242 	add	edx, eax			; edx = .jumper_0 - 9 * order
   1243 	inc	edx				; compensate for the shorter opcode on the last iteration
   1244 	mov	eax, [esp + 28]			; eax = qlp_coeff[]
   1245 	xor	ebp, ebp			; sum = 0
   1246 	jmp	edx
   1247 
   1248 	mov	ecx, [eax + 124]		; ecx =  qlp_coeff[31]
   1249 	imul	ecx, [edi - 128]		; ecx =  qlp_coeff[31] * data[i-32]
   1250 	add	ebp, ecx			; sum += qlp_coeff[31] * data[i-32]
   1251 	mov	ecx, [eax + 120]		; ecx =  qlp_coeff[30]
   1252 	imul	ecx, [edi - 124]		; ecx =  qlp_coeff[30] * data[i-31]
   1253 	add	ebp, ecx			; sum += qlp_coeff[30] * data[i-31]
   1254 	mov	ecx, [eax + 116]		; ecx =  qlp_coeff[29]
   1255 	imul	ecx, [edi - 120]		; ecx =  qlp_coeff[29] * data[i-30]
   1256 	add	ebp, ecx			; sum += qlp_coeff[29] * data[i-30]
   1257 	mov	ecx, [eax + 112]		; ecx =  qlp_coeff[28]
   1258 	imul	ecx, [edi - 116]		; ecx =  qlp_coeff[28] * data[i-29]
   1259 	add	ebp, ecx			; sum += qlp_coeff[28] * data[i-29]
   1260 	mov	ecx, [eax + 108]		; ecx =  qlp_coeff[27]
   1261 	imul	ecx, [edi - 112]		; ecx =  qlp_coeff[27] * data[i-28]
   1262 	add	ebp, ecx			; sum += qlp_coeff[27] * data[i-28]
   1263 	mov	ecx, [eax + 104]		; ecx =  qlp_coeff[26]
   1264 	imul	ecx, [edi - 108]		; ecx =  qlp_coeff[26] * data[i-27]
   1265 	add	ebp, ecx			; sum += qlp_coeff[26] * data[i-27]
   1266 	mov	ecx, [eax + 100]		; ecx =  qlp_coeff[25]
   1267 	imul	ecx, [edi - 104]		; ecx =  qlp_coeff[25] * data[i-26]
   1268 	add	ebp, ecx			; sum += qlp_coeff[25] * data[i-26]
   1269 	mov	ecx, [eax + 96]			; ecx =  qlp_coeff[24]
   1270 	imul	ecx, [edi - 100]		; ecx =  qlp_coeff[24] * data[i-25]
   1271 	add	ebp, ecx			; sum += qlp_coeff[24] * data[i-25]
   1272 	mov	ecx, [eax + 92]			; ecx =  qlp_coeff[23]
   1273 	imul	ecx, [edi - 96]			; ecx =  qlp_coeff[23] * data[i-24]
   1274 	add	ebp, ecx			; sum += qlp_coeff[23] * data[i-24]
   1275 	mov	ecx, [eax + 88]			; ecx =  qlp_coeff[22]
   1276 	imul	ecx, [edi - 92]			; ecx =  qlp_coeff[22] * data[i-23]
   1277 	add	ebp, ecx			; sum += qlp_coeff[22] * data[i-23]
   1278 	mov	ecx, [eax + 84]			; ecx =  qlp_coeff[21]
   1279 	imul	ecx, [edi - 88]			; ecx =  qlp_coeff[21] * data[i-22]
   1280 	add	ebp, ecx			; sum += qlp_coeff[21] * data[i-22]
   1281 	mov	ecx, [eax + 80]			; ecx =  qlp_coeff[20]
   1282 	imul	ecx, [edi - 84]			; ecx =  qlp_coeff[20] * data[i-21]
   1283 	add	ebp, ecx			; sum += qlp_coeff[20] * data[i-21]
   1284 	mov	ecx, [eax + 76]			; ecx =  qlp_coeff[19]
   1285 	imul	ecx, [edi - 80]			; ecx =  qlp_coeff[19] * data[i-20]
   1286 	add	ebp, ecx			; sum += qlp_coeff[19] * data[i-20]
   1287 	mov	ecx, [eax + 72]			; ecx =  qlp_coeff[18]
   1288 	imul	ecx, [edi - 76]			; ecx =  qlp_coeff[18] * data[i-19]
   1289 	add	ebp, ecx			; sum += qlp_coeff[18] * data[i-19]
   1290 	mov	ecx, [eax + 68]			; ecx =  qlp_coeff[17]
   1291 	imul	ecx, [edi - 72]			; ecx =  qlp_coeff[17] * data[i-18]
   1292 	add	ebp, ecx			; sum += qlp_coeff[17] * data[i-18]
   1293 	mov	ecx, [eax + 64]			; ecx =  qlp_coeff[16]
   1294 	imul	ecx, [edi - 68]			; ecx =  qlp_coeff[16] * data[i-17]
   1295 	add	ebp, ecx			; sum += qlp_coeff[16] * data[i-17]
   1296 	mov	ecx, [eax + 60]			; ecx =  qlp_coeff[15]
   1297 	imul	ecx, [edi - 64]			; ecx =  qlp_coeff[15] * data[i-16]
   1298 	add	ebp, ecx			; sum += qlp_coeff[15] * data[i-16]
   1299 	mov	ecx, [eax + 56]			; ecx =  qlp_coeff[14]
   1300 	imul	ecx, [edi - 60]			; ecx =  qlp_coeff[14] * data[i-15]
   1301 	add	ebp, ecx			; sum += qlp_coeff[14] * data[i-15]
   1302 	mov	ecx, [eax + 52]			; ecx =  qlp_coeff[13]
   1303 	imul	ecx, [edi - 56]			; ecx =  qlp_coeff[13] * data[i-14]
   1304 	add	ebp, ecx			; sum += qlp_coeff[13] * data[i-14]
   1305 	mov	ecx, [eax + 48]			; ecx =  qlp_coeff[12]
   1306 	imul	ecx, [edi - 52]			; ecx =  qlp_coeff[12] * data[i-13]
   1307 	add	ebp, ecx			; sum += qlp_coeff[12] * data[i-13]
   1308 	mov	ecx, [eax + 44]			; ecx =  qlp_coeff[11]
   1309 	imul	ecx, [edi - 48]			; ecx =  qlp_coeff[11] * data[i-12]
   1310 	add	ebp, ecx			; sum += qlp_coeff[11] * data[i-12]
   1311 	mov	ecx, [eax + 40]			; ecx =  qlp_coeff[10]
   1312 	imul	ecx, [edi - 44]			; ecx =  qlp_coeff[10] * data[i-11]
   1313 	add	ebp, ecx			; sum += qlp_coeff[10] * data[i-11]
   1314 	mov	ecx, [eax + 36]			; ecx =  qlp_coeff[ 9]
   1315 	imul	ecx, [edi - 40]			; ecx =  qlp_coeff[ 9] * data[i-10]
   1316 	add	ebp, ecx			; sum += qlp_coeff[ 9] * data[i-10]
   1317 	mov	ecx, [eax + 32]			; ecx =  qlp_coeff[ 8]
   1318 	imul	ecx, [edi - 36]			; ecx =  qlp_coeff[ 8] * data[i- 9]
   1319 	add	ebp, ecx			; sum += qlp_coeff[ 8] * data[i- 9]
   1320 	mov	ecx, [eax + 28]			; ecx =  qlp_coeff[ 7]
   1321 	imul	ecx, [edi - 32]			; ecx =  qlp_coeff[ 7] * data[i- 8]
   1322 	add	ebp, ecx			; sum += qlp_coeff[ 7] * data[i- 8]
   1323 	mov	ecx, [eax + 24]			; ecx =  qlp_coeff[ 6]
   1324 	imul	ecx, [edi - 28]			; ecx =  qlp_coeff[ 6] * data[i- 7]
   1325 	add	ebp, ecx			; sum += qlp_coeff[ 6] * data[i- 7]
   1326 	mov	ecx, [eax + 20]			; ecx =  qlp_coeff[ 5]
   1327 	imul	ecx, [edi - 24]			; ecx =  qlp_coeff[ 5] * data[i- 6]
   1328 	add	ebp, ecx			; sum += qlp_coeff[ 5] * data[i- 6]
   1329 	mov	ecx, [eax + 16]			; ecx =  qlp_coeff[ 4]
   1330 	imul	ecx, [edi - 20]			; ecx =  qlp_coeff[ 4] * data[i- 5]
   1331 	add	ebp, ecx			; sum += qlp_coeff[ 4] * data[i- 5]
   1332 	mov	ecx, [eax + 12]			; ecx =  qlp_coeff[ 3]
   1333 	imul	ecx, [edi - 16]			; ecx =  qlp_coeff[ 3] * data[i- 4]
   1334 	add	ebp, ecx			; sum += qlp_coeff[ 3] * data[i- 4]
   1335 	mov	ecx, [eax + 8]			; ecx =  qlp_coeff[ 2]
   1336 	imul	ecx, [edi - 12]			; ecx =  qlp_coeff[ 2] * data[i- 3]
   1337 	add	ebp, ecx			; sum += qlp_coeff[ 2] * data[i- 3]
   1338 	mov	ecx, [eax + 4]			; ecx =  qlp_coeff[ 1]
   1339 	imul	ecx, [edi - 8]			; ecx =  qlp_coeff[ 1] * data[i- 2]
   1340 	add	ebp, ecx			; sum += qlp_coeff[ 1] * data[i- 2]
   1341 	mov	ecx, [eax]			; ecx =  qlp_coeff[ 0] (NOTE: one byte missing from instruction)
   1342 	imul	ecx, [edi - 4]			; ecx =  qlp_coeff[ 0] * data[i- 1]
   1343 	add	ebp, ecx			; sum += qlp_coeff[ 0] * data[i- 1]
   1344 .jumper_0:
   1345 
   1346 	mov	cl, [esp + 36]			; cl = lp_quantization (ecx was used as scratch above)
   1347 	sar	ebp, cl				; ebp = (sum >> lp_quantization)
   1348 	add	ebp, [esi + edi]		; ebp = residual[i] + (sum >> lp_quantization)
   1349 	mov	[edi], ebp			; data[i] = residual[i] + (sum >> lp_quantization)
   1350 	add	edi, byte 4
   1351 
   1352 	dec	ebx
   1353 	jz	short .end
   1354 	xor	ebp, ebp			; sum = 0 for the next sample
   1355 	jmp	edx				; re-enter the chain at the same entry point
   1356 
   1357 .end:
   1358 	pop	edi
   1359 	pop	esi
   1360 	pop	ebx
   1361 	pop	ebp
   1362 	ret
   1363 
   1364 ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
   1365 ; the channel and qlp_coeffs must be <= 16.  Especially note that this routine
   1366 ; cannot be used for side-channel coded 16bps channels since the effective bps
   1367 ; is 17.
   1368 ; WATCHOUT: this routine requires that each data array have a buffer of up to
   1369 ; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
   1370 ; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
        ;
        ; Implementation notes:
        ;  - Orders below 4 are handed to the scalar routine's .begin entry point.
        ;  - The low 16 bits of every coefficient are pushed onto an 8-byte aligned
        ;    scratch area on the stack and zero-padded to a multiple of four words;
        ;    qlp_coeff[0] ends up at the highest address.
        ;  - mm4 holds a sliding window of the four most recent output samples as
        ;    words, mm5 the words of qlp_coeff[3..0], mm6 the shift count.
        ;  - One sample is produced per iteration: each new data[i] is shifted into
        ;    the window before the next prediction is computed.
   1371 	ALIGN	16
   1372 cident FLAC__lpc_restore_signal_asm_ia32_mmx
   1373 	;[esp + 40]	data[]
   1374 	;[esp + 36]	lp_quantization
   1375 	;[esp + 32]	order
   1376 	;[esp + 28]	qlp_coeff[]
   1377 	;[esp + 24]	data_len
   1378 	;[esp + 20]	residual[]
   1379 
   1380 	;ASSERT(order > 0)
   1381 
   1382 	push	ebp
   1383 	push	ebx
   1384 	push	esi
   1385 	push	edi
   1386 
   1387 	mov	esi, [esp + 20]			; esi = residual[]
   1388 	mov	edi, [esp + 40]			; edi = data[]
   1389 	mov	eax, [esp + 32]			; eax = order
   1390 	mov	ebx, [esp + 24]			; ebx = data_len
   1391 
   1392 	test	ebx, ebx
   1393 	jz	near .end			; do nothing if data_len == 0
   1394 	cmp	eax, byte 4
   1395 	jb	near FLAC__lpc_restore_signal_asm_ia32.begin	; order < 4: use the scalar routine
   1396 
   1397 	mov	edx, [esp + 28]			; edx = qlp_coeff[]
   1398 	movd	mm6, [esp + 36]			; mm6 = 0:lp_quantization
   1399 	mov	ebp, esp			; ebp = esp to restore at .mmx_end
   1400 
   1401 	and	esp, 0xfffffff8			; align the scratch area to 8 bytes
   1402 
   1403 	xor	ecx, ecx
   1404 .copy_qlp_loop:
   1405 	push	word [edx + 4 * ecx]		; push the low 16 bits of qlp_coeff[ecx]
   1406 	inc	ecx
   1407 	cmp	ecx, eax
   1408 	jnz	short .copy_qlp_loop
   1409 
   1410 	and	ecx, 0x3			; ecx = order mod 4
   1411 	test	ecx, ecx
   1412 	je	short .za_end			; already a multiple of four words
   1413 	sub	ecx, byte 4			; ecx = -(number of padding words needed)
   1414 .za_loop:
   1415 	push	word 0				; zero coefficient as padding
   1416 	inc	eax				; eax becomes the padded order
   1417 	inc	ecx
   1418 	jnz	short .za_loop
   1419 .za_end:
   1420 
   1421 	movq	mm5, [esp + 2 * eax - 8]	; mm5 = first four words pushed: qlp_coeff[3..0], word 3 = qlp_coeff[0]
   1422 	movd	mm4, [edi - 16]
   1423 	punpckldq	mm4, [edi - 12]		; mm4 = data[-4] : data[-3]
   1424 	movd	mm0, [edi - 8]
   1425 	punpckldq	mm0, [edi - 4]		; mm0 = data[-2] : data[-1]
   1426 	packssdw	mm4, mm0		; mm4 = data[-4..-1] as four signed-saturated words
   1427 
   1428 	cmp	eax, byte 4
   1429 	jnbe	short .mmx_4more		; padded order > 4
   1430 
        	; --- padded order == 4: one pmaddwd per sample is enough ---
   1431 	ALIGN	16
   1432 .mmx_4_loop_i:
   1433 	movq	mm7, mm4
   1434 	pmaddwd	mm7, mm5			; two partial sums over data[i-4..i-1]
   1435 	movq	mm0, mm7
   1436 	punpckhdq	mm7, mm7
   1437 	paddd	mm7, mm0			; low dword of mm7 = full sum
   1438 	psrad	mm7, mm6			; >> lp_quantization
   1439 	movd	mm1, [esi]			; mm1 = residual[i]
   1440 	paddd	mm7, mm1			; low dword = residual[i] + (sum >> lp_quantization)
   1441 	movd	[edi], mm7			; data[i] = low dword
   1442 	psllq	mm7, 48				; low 16 bits of data[i] moved to the top word
   1443 	psrlq	mm4, 16				; drop the oldest word of the window
   1444 	por	mm4, mm7			; mm4 = window data[i-3..i]
   1445 
   1446 	add	esi, byte 4
   1447 	add	edi, byte 4
   1448 
   1449 	dec	ebx
   1450 	jnz	.mmx_4_loop_i
   1451 	jmp	.mmx_end
        	; --- padded order > 4: extra inner loop over the older coefficient groups ---
   1452 .mmx_4more:
   1453 	shl	eax, 2
   1454 	neg	eax
   1455 	add	eax, byte 16			; eax = 16 - 4 * padded order (negative byte offset)
   1456 	ALIGN	16
   1457 .mmx_4more_loop_i:
   1458 	mov	ecx, edi
   1459 	add	ecx, eax			; ecx = data cursor; first inner iteration reads from data[i - padded order]
   1460 	mov	edx, esp			; edx = lowest-addressed words = highest-index (incl. padding) coefficients
   1461 
   1462 	movq	mm7, mm4
   1463 	pmaddwd	mm7, mm5			; partial sums from qlp_coeff[0..3] and the window
   1464 
   1465 	ALIGN	16
   1466 .mmx_4more_loop_j:
   1467 	movd	mm0, [ecx - 16]
   1468 	punpckldq	mm0, [ecx - 12]
   1469 	movd	mm1, [ecx - 8]
   1470 	punpckldq	mm1, [ecx - 4]
   1471 	packssdw	mm0, mm1		; four consecutive samples as signed-saturated words
   1472 	pmaddwd	mm0, [edx]
   1473 	paddd	mm7, mm0			; accumulate partial sums
   1474 
   1475 	add	edx, byte 8			; next four coefficient words
   1476 	add	ecx, byte 16			; next four samples
   1477 	cmp	ecx, edi
   1478 	jnz	.mmx_4more_loop_j
   1479 
   1480 	movq	mm0, mm7
   1481 	punpckhdq	mm7, mm7
   1482 	paddd	mm7, mm0			; low dword of mm7 = full sum
   1483 	psrad	mm7, mm6			; >> lp_quantization
   1484 	movd	mm1, [esi]			; mm1 = residual[i]
   1485 	paddd	mm7, mm1			; low dword = residual[i] + (sum >> lp_quantization)
   1486 	movd	[edi], mm7			; data[i] = low dword
   1487 	psllq	mm7, 48				; low 16 bits of data[i] moved to the top word
   1488 	psrlq	mm4, 16				; drop the oldest word of the window
   1489 	por	mm4, mm7			; mm4 = window data[i-3..i]
   1490 
   1491 	add	esi, byte 4
   1492 	add	edi, byte 4
   1493 
   1494 	dec	ebx
   1495 	jnz	short .mmx_4more_loop_i
   1496 .mmx_end:
   1497 	emms					; leave MMX state before returning
   1498 	mov	esp, ebp			; discard the coefficient scratch area
   1499 
   1500 .end:
   1501 	pop	edi
   1502 	pop	esi
   1503 	pop	ebx
   1504 	pop	ebp
   1505 	ret
   1506 
   1507 end		; presumably a macro supplied by nasm.h -- TODO confirm what it expands to
   1508 
        ; On ELF targets emit an empty .note.GNU-stack section; by GNU toolchain
        ; convention its presence marks the object as not needing an executable stack.
   1509 %ifdef OBJ_FORMAT_elf
   1510        section .note.GNU-stack noalloc
   1511 %endif
   1511 %endif
   1512