1 ; vim:filetype=nasm ts=8 2 3 ; libFLAC - Free Lossless Audio Codec library 4 ; Copyright (C) 2001,2002,2003,2004,2005,2006,2007 Josh Coalson 5 ; 6 ; Redistribution and use in source and binary forms, with or without 7 ; modification, are permitted provided that the following conditions 8 ; are met: 9 ; 10 ; - Redistributions of source code must retain the above copyright 11 ; notice, this list of conditions and the following disclaimer. 12 ; 13 ; - Redistributions in binary form must reproduce the above copyright 14 ; notice, this list of conditions and the following disclaimer in the 15 ; documentation and/or other materials provided with the distribution. 16 ; 17 ; - Neither the name of the Xiph.org Foundation nor the names of its 18 ; contributors may be used to endorse or promote products derived from 19 ; this software without specific prior written permission. 20 ; 21 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR 25 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 26 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 27 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 29 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 33 %include "nasm.h" 34 35 data_section 36 37 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32 38 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4 39 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8 40 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12 41 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow 42 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32 43 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx 44 cglobal FLAC__lpc_restore_signal_asm_ia32 45 cglobal FLAC__lpc_restore_signal_asm_ia32_mmx 46 47 code_section 48 49 ; ********************************************************************** 50 ; 51 ; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) 52 ; { 53 ; FLAC__real d; 54 ; unsigned sample, coeff; 55 ; const unsigned limit = data_len - lag; 56 ; 57 ; FLAC__ASSERT(lag > 0); 58 ; FLAC__ASSERT(lag <= data_len); 59 ; 60 ; for(coeff = 0; coeff < lag; coeff++) 61 ; autoc[coeff] = 0.0; 62 ; for(sample = 0; sample <= limit; sample++) { 63 ; d = data[sample]; 64 ; for(coeff = 0; coeff < lag; coeff++) 65 ; autoc[coeff] += d * data[sample+coeff]; 66 ; } 67 ; for(; sample < data_len; sample++) { 68 ; d = data[sample]; 69 ; for(coeff = 0; coeff < data_len - sample; coeff++) 70 ; autoc[coeff] += d * data[sample+coeff]; 71 ; } 72 ; } 73 ; 74 ALIGN 16 75 cident FLAC__lpc_compute_autocorrelation_asm_ia32 76 ;[esp + 28] == autoc[] 77 ;[esp + 24] == lag 78 ;[esp + 20] == data_len 79 ;[esp + 16] == data[] 80 81 ;ASSERT(lag > 0) 82 ;ASSERT(lag <= 33) 83 ;ASSERT(lag <= data_len) 84 85 .begin: 86 push esi 87 push edi 88 push ebx 89 90 ; for(coeff = 0; coeff < lag; coeff++) 91 ; autoc[coeff] = 0.0; 92 mov edi, [esp + 28] ; edi == autoc 93 mov ecx, [esp + 24] ; ecx = # of dwords (=lag) of 0 to write 94 xor eax, eax 95 rep stosd 96 97 ; const unsigned limit = data_len - lag; 98 mov eax, [esp + 24] ; eax == lag 99 mov ecx, [esp 
+ 20] 100 sub ecx, eax ; ecx == limit 101 102 mov edi, [esp + 28] ; edi == autoc 103 mov esi, [esp + 16] ; esi == data 104 inc ecx ; we are looping <= limit so we add one to the counter 105 106 ; for(sample = 0; sample <= limit; sample++) { 107 ; d = data[sample]; 108 ; for(coeff = 0; coeff < lag; coeff++) 109 ; autoc[coeff] += d * data[sample+coeff]; 110 ; } 111 fld dword [esi] ; ST = d <- data[sample] 112 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax) 113 lea edx, [eax + eax*2] 114 neg edx 115 lea edx, [eax + edx*4 + .jumper1_0 - .get_eip1] 116 call .get_eip1 117 .get_eip1: 118 pop ebx 119 add edx, ebx 120 inc edx ; compensate for the shorter opcode on the last iteration 121 inc edx ; compensate for the shorter opcode on the last iteration 122 inc edx ; compensate for the shorter opcode on the last iteration 123 cmp eax, 33 124 jne .loop1_start 125 sub edx, byte 9 ; compensate for the longer opcodes on the first iteration 126 .loop1_start: 127 jmp edx 128 129 fld st0 ; ST = d d 130 fmul dword [esi + (32*4)] ; ST = d*data[sample+32] d WATCHOUT: not a byte displacement here! 131 fadd dword [edi + (32*4)] ; ST = autoc[32]+d*data[sample+32] d WATCHOUT: not a byte displacement here! 132 fstp dword [edi + (32*4)] ; autoc[32]+=d*data[sample+32] ST = d WATCHOUT: not a byte displacement here! 
133 fld st0 ; ST = d d 134 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d 135 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d 136 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d 137 fld st0 ; ST = d d 138 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d 139 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d 140 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d 141 fld st0 ; ST = d d 142 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d 143 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d 144 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d 145 fld st0 ; ST = d d 146 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d 147 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d 148 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d 149 fld st0 ; ST = d d 150 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d 151 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d 152 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d 153 fld st0 ; ST = d d 154 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d 155 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d 156 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d 157 fld st0 ; ST = d d 158 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d 159 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d 160 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d 161 fld st0 ; ST = d d 162 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d 163 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d 164 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d 165 fld st0 ; ST = d d 166 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d 167 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d 168 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d 169 fld st0 ; ST = d d 170 fmul dword [esi + (22*4)] ; ST = 
d*data[sample+22] d 171 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d 172 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d 173 fld st0 ; ST = d d 174 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d 175 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d 176 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d 177 fld st0 ; ST = d d 178 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d 179 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d 180 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d 181 fld st0 ; ST = d d 182 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d 183 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d 184 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d 185 fld st0 ; ST = d d 186 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d 187 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d 188 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d 189 fld st0 ; ST = d d 190 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d 191 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d 192 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d 193 fld st0 ; ST = d d 194 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d 195 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d 196 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d 197 fld st0 ; ST = d d 198 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d 199 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d 200 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d 201 fld st0 ; ST = d d 202 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d 203 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d 204 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d 205 fld st0 ; ST = d d 206 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d 207 fadd dword [edi + (13*4)] ; ST = 
autoc[13]+d*data[sample+13] d 208 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d 209 fld st0 ; ST = d d 210 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d 211 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d 212 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d 213 fld st0 ; ST = d d 214 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d 215 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d 216 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d 217 fld st0 ; ST = d d 218 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d 219 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d 220 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d 221 fld st0 ; ST = d d 222 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d 223 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d 224 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d 225 fld st0 ; ST = d d 226 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d 227 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d 228 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d 229 fld st0 ; ST = d d 230 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d 231 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d 232 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d 233 fld st0 ; ST = d d 234 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d 235 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d 236 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d 237 fld st0 ; ST = d d 238 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d 239 fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d 240 fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d 241 fld st0 ; ST = d d 242 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d 243 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d 244 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST 
= d 245 fld st0 ; ST = d d 246 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d 247 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d 248 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d 249 fld st0 ; ST = d d 250 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d 251 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d 252 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d 253 fld st0 ; ST = d d 254 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d 255 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d 256 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d 257 fld st0 ; ST = d d 258 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here! 259 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here! 260 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here! 261 .jumper1_0: 262 263 fstp st0 ; pop d, ST = empty 264 add esi, byte 4 ; sample++ 265 dec ecx 266 jz .loop1_end 267 fld dword [esi] ; ST = d <- data[sample] 268 jmp edx 269 .loop1_end: 270 271 ; for(; sample < data_len; sample++) { 272 ; d = data[sample]; 273 ; for(coeff = 0; coeff < data_len - sample; coeff++) 274 ; autoc[coeff] += d * data[sample+coeff]; 275 ; } 276 mov ecx, [esp + 24] ; ecx <- lag 277 dec ecx ; ecx <- lag - 1 278 jz near .end ; skip loop if 0 (i.e. 
lag == 1) 279 280 fld dword [esi] ; ST = d <- data[sample] 281 mov eax, ecx ; eax <- lag - 1 == data_len - sample the first time through 282 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax) 283 lea edx, [eax + eax*2] 284 neg edx 285 lea edx, [eax + edx*4 + .jumper2_0 - .get_eip2] 286 call .get_eip2 287 .get_eip2: 288 pop ebx 289 add edx, ebx 290 inc edx ; compensate for the shorter opcode on the last iteration 291 inc edx ; compensate for the shorter opcode on the last iteration 292 inc edx ; compensate for the shorter opcode on the last iteration 293 jmp edx 294 295 fld st0 ; ST = d d 296 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d 297 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d 298 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d 299 fld st0 ; ST = d d 300 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d 301 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d 302 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d 303 fld st0 ; ST = d d 304 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d 305 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d 306 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d 307 fld st0 ; ST = d d 308 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d 309 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d 310 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d 311 fld st0 ; ST = d d 312 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d 313 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d 314 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d 315 fld st0 ; ST = d d 316 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d 317 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d 318 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d 319 fld st0 ; ST = d d 320 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d 321 fadd dword [edi + (25*4)] ; 
ST = autoc[25]+d*data[sample+25] d 322 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d 323 fld st0 ; ST = d d 324 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d 325 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d 326 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d 327 fld st0 ; ST = d d 328 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d 329 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d 330 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d 331 fld st0 ; ST = d d 332 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d 333 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d 334 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d 335 fld st0 ; ST = d d 336 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d 337 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d 338 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d 339 fld st0 ; ST = d d 340 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d 341 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d 342 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d 343 fld st0 ; ST = d d 344 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d 345 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d 346 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d 347 fld st0 ; ST = d d 348 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d 349 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d 350 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d 351 fld st0 ; ST = d d 352 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d 353 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d 354 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d 355 fld st0 ; ST = d d 356 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d 357 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d 358 fstp dword [edi + (16*4)] 
; autoc[16]+=d*data[sample+16] ST = d 359 fld st0 ; ST = d d 360 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d 361 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d 362 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d 363 fld st0 ; ST = d d 364 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d 365 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d 366 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d 367 fld st0 ; ST = d d 368 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d 369 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d 370 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d 371 fld st0 ; ST = d d 372 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d 373 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d 374 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d 375 fld st0 ; ST = d d 376 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d 377 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d 378 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d 379 fld st0 ; ST = d d 380 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d 381 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d 382 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d 383 fld st0 ; ST = d d 384 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d 385 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d 386 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d 387 fld st0 ; ST = d d 388 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d 389 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d 390 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d 391 fld st0 ; ST = d d 392 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d 393 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d 394 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d 395 fld st0 ; ST = d d 396 fmul dword 
[esi + ( 6*4)] ; ST = d*data[sample+6] d 397 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d 398 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d 399 fld st0 ; ST = d d 400 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d 401 fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d 402 fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d 403 fld st0 ; ST = d d 404 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d 405 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d 406 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d 407 fld st0 ; ST = d d 408 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d 409 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d 410 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d 411 fld st0 ; ST = d d 412 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d 413 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d 414 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d 415 fld st0 ; ST = d d 416 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d 417 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d 418 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d 419 fld st0 ; ST = d d 420 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here! 421 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here! 422 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here! 
423 .jumper2_0: 424 425 fstp st0 ; pop d, ST = empty 426 add esi, byte 4 ; sample++ 427 dec ecx 428 jz .loop2_end 429 add edx, byte 11 ; adjust our inner loop counter by adjusting the jump target 430 fld dword [esi] ; ST = d <- data[sample] 431 jmp edx 432 .loop2_end: 433 434 .end: 435 pop ebx 436 pop edi 437 pop esi 438 ret 439 440 ALIGN 16 441 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4 442 ;[esp + 16] == autoc[] 443 ;[esp + 12] == lag 444 ;[esp + 8] == data_len 445 ;[esp + 4] == data[] 446 447 ;ASSERT(lag > 0) 448 ;ASSERT(lag <= 4) 449 ;ASSERT(lag <= data_len) 450 451 ; for(coeff = 0; coeff < lag; coeff++) 452 ; autoc[coeff] = 0.0; 453 xorps xmm5, xmm5 454 455 mov edx, [esp + 8] ; edx == data_len 456 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0] 457 458 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0] 459 add eax, 4 460 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0] 461 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] 462 .warmup: ; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample] 463 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2 464 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2 465 dec edx 466 jz .loop_end 467 ALIGN 16 468 .loop_start: 469 ; start by reading the next sample 470 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample] 471 add eax, 4 472 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample] 473 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float 474 movss xmm2, xmm0 475 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2 476 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2 477 dec edx 478 jnz .loop_start 479 .loop_end: 480 ; store autoc 481 mov edx, [esp + 16] ; edx == autoc 482 movups [edx], xmm5 483 484 .end: 485 ret 486 487 ALIGN 16 488 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8 489 ;[esp + 16] == autoc[] 490 ;[esp + 12] == lag 491 ;[esp + 8] == data_len 492 ;[esp + 4] == data[] 493 494 ;ASSERT(lag > 0) 495 ;ASSERT(lag <= 8) 496 
;ASSERT(lag <= data_len) 497 498 ; for(coeff = 0; coeff < lag; coeff++) 499 ; autoc[coeff] = 0.0; 500 xorps xmm5, xmm5 501 xorps xmm6, xmm6 502 503 mov edx, [esp + 8] ; edx == data_len 504 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0] 505 506 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0] 507 add eax, 4 508 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0] 509 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] 510 movaps xmm1, xmm0 ; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] 511 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0 512 .warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample] 513 mulps xmm0, xmm2 514 mulps xmm1, xmm3 ; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2 515 addps xmm5, xmm0 516 addps xmm6, xmm1 ; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2 517 dec edx 518 jz .loop_end 519 ALIGN 16 520 .loop_start: 521 ; start by reading the next sample 522 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample] 523 ; here we reorder the instructions; see the (#) indexes for a logical order 524 shufps xmm2, xmm2, 93h ; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float 525 add eax, 4 ; (0) 526 shufps xmm3, xmm3, 93h ; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float 527 shufps xmm0, xmm0, 0 ; (1) xmm0 = data[sample],data[sample],data[sample],data[sample] 528 movss xmm3, xmm2 ; (5) 529 movaps xmm1, xmm0 ; (2) xmm1 = data[sample],data[sample],data[sample],data[sample] 530 movss xmm2, xmm0 ; (6) 531 mulps xmm1, xmm3 ; (8) 532 mulps xmm0, xmm2 ; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2 533 addps xmm6, xmm1 ; (10) 534 addps xmm5, xmm0 ; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2 535 dec edx 536 jnz .loop_start 537 .loop_end: 538 ; store autoc 539 mov edx, [esp + 16] ; edx == autoc 540 movups [edx], xmm5 541 movups [edx + 16], xmm6 542 543 .end: 544 ret 545 546 ALIGN 16 547 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12 548 ;[esp + 16] == autoc[] 549 ;[esp + 12] == lag 550 
;[esp + 8] == data_len 551 ;[esp + 4] == data[] 552 553 ;ASSERT(lag > 0) 554 ;ASSERT(lag <= 12) 555 ;ASSERT(lag <= data_len) 556 557 ; for(coeff = 0; coeff < lag; coeff++) 558 ; autoc[coeff] = 0.0; 559 xorps xmm5, xmm5 560 xorps xmm6, xmm6 561 xorps xmm7, xmm7 562 563 mov edx, [esp + 8] ; edx == data_len 564 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0] 565 566 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0] 567 add eax, 4 568 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0] 569 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] 570 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0 571 xorps xmm4, xmm4 ; xmm4 = 0,0,0,0 572 .warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample] 573 movaps xmm1, xmm0 574 mulps xmm1, xmm2 575 addps xmm5, xmm1 576 movaps xmm1, xmm0 577 mulps xmm1, xmm3 578 addps xmm6, xmm1 579 mulps xmm0, xmm4 580 addps xmm7, xmm0 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2 581 dec edx 582 jz .loop_end 583 ALIGN 16 584 .loop_start: 585 ; start by reading the next sample 586 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample] 587 add eax, 4 588 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample] 589 590 ; shift xmm4:xmm3:xmm2 left by one float 591 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float 592 shufps xmm3, xmm3, 93h ; 93h=2-1-0-3 => xmm3 gets rotated left by one float 593 shufps xmm4, xmm4, 93h ; 93h=2-1-0-3 => xmm4 gets rotated left by one float 594 movss xmm4, xmm3 595 movss xmm3, xmm2 596 movss xmm2, xmm0 597 598 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm3:xmm3:xmm2 599 movaps xmm1, xmm0 600 mulps xmm1, xmm2 601 addps xmm5, xmm1 602 movaps xmm1, xmm0 603 mulps xmm1, xmm3 604 addps xmm6, xmm1 605 mulps xmm0, xmm4 606 addps xmm7, xmm0 607 608 dec edx 609 jnz .loop_start 610 .loop_end: 611 ; store autoc 612 mov edx, [esp + 16] ; edx == autoc 613 movups [edx], xmm5 614 movups [edx + 16], xmm6 615 movups [edx + 32], xmm7 616 617 .end: 
618 ret 619 620 ALIGN 16 621 cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow 622 ;[ebp + 32] autoc 623 ;[ebp + 28] lag 624 ;[ebp + 24] data_len 625 ;[ebp + 20] data 626 627 push ebp 628 push ebx 629 push esi 630 push edi 631 mov ebp, esp 632 633 mov esi, [ebp + 20] 634 mov edi, [ebp + 24] 635 mov edx, [ebp + 28] 636 inc edx 637 and edx, byte -2 638 mov eax, edx 639 neg eax 640 and esp, byte -8 641 lea esp, [esp + 4 * eax] 642 mov ecx, edx 643 xor eax, eax 644 .loop0: 645 dec ecx 646 mov [esp + 4 * ecx], eax 647 jnz short .loop0 648 649 mov eax, edi 650 sub eax, edx 651 mov ebx, edx 652 and ebx, byte 1 653 sub eax, ebx 654 lea ecx, [esi + 4 * eax - 12] 655 cmp esi, ecx 656 mov eax, esi 657 ja short .loop2_pre 658 ALIGN 16 ;4 nops 659 .loop1_i: 660 movd mm0, [eax] 661 movd mm2, [eax + 4] 662 movd mm4, [eax + 8] 663 movd mm6, [eax + 12] 664 mov ebx, edx 665 punpckldq mm0, mm0 666 punpckldq mm2, mm2 667 punpckldq mm4, mm4 668 punpckldq mm6, mm6 669 ALIGN 16 ;3 nops 670 .loop1_j: 671 sub ebx, byte 2 672 movd mm1, [eax + 4 * ebx] 673 movd mm3, [eax + 4 * ebx + 4] 674 movd mm5, [eax + 4 * ebx + 8] 675 movd mm7, [eax + 4 * ebx + 12] 676 punpckldq mm1, mm3 677 punpckldq mm3, mm5 678 pfmul mm1, mm0 679 punpckldq mm5, mm7 680 pfmul mm3, mm2 681 punpckldq mm7, [eax + 4 * ebx + 16] 682 pfmul mm5, mm4 683 pfmul mm7, mm6 684 pfadd mm1, mm3 685 movq mm3, [esp + 4 * ebx] 686 pfadd mm5, mm7 687 pfadd mm1, mm5 688 pfadd mm3, mm1 689 movq [esp + 4 * ebx], mm3 690 jg short .loop1_j 691 692 add eax, byte 16 693 cmp eax, ecx 694 jb short .loop1_i 695 696 .loop2_pre: 697 mov ebx, eax 698 sub eax, esi 699 shr eax, 2 700 lea ecx, [esi + 4 * edi] 701 mov esi, ebx 702 .loop2_i: 703 movd mm0, [esi] 704 mov ebx, edi 705 sub ebx, eax 706 cmp ebx, edx 707 jbe short .loop2_j 708 mov ebx, edx 709 .loop2_j: 710 dec ebx 711 movd mm1, [esi + 4 * ebx] 712 pfmul mm1, mm0 713 movd mm2, [esp + 4 * ebx] 714 pfadd mm1, mm2 715 movd [esp + 4 * ebx], mm1 716 717 jnz short .loop2_j 718 719 add esi, 
byte 4 720 inc eax 721 cmp esi, ecx 722 jnz short .loop2_i 723 724 mov edi, [ebp + 32] 725 mov edx, [ebp + 28] 726 .loop3: 727 dec edx 728 mov eax, [esp + 4 * edx] 729 mov [edi + 4 * edx], eax 730 jnz short .loop3 731 732 femms 733 734 mov esp, ebp 735 pop edi 736 pop esi 737 pop ebx 738 pop ebp 739 ret 740 741 ;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) 742 ; 743 ; for(i = 0; i < data_len; i++) { 744 ; sum = 0; 745 ; for(j = 0; j < order; j++) 746 ; sum += qlp_coeff[j] * data[i-j-1]; 747 ; residual[i] = data[i] - (sum >> lp_quantization); 748 ; } 749 ; 750 ALIGN 16 751 cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32 752 ;[esp + 40] residual[] 753 ;[esp + 36] lp_quantization 754 ;[esp + 32] order 755 ;[esp + 28] qlp_coeff[] 756 ;[esp + 24] data_len 757 ;[esp + 20] data[] 758 759 ;ASSERT(order > 0) 760 761 push ebp 762 push ebx 763 push esi 764 push edi 765 766 mov esi, [esp + 20] ; esi = data[] 767 mov edi, [esp + 40] ; edi = residual[] 768 mov eax, [esp + 32] ; eax = order 769 mov ebx, [esp + 24] ; ebx = data_len 770 771 test ebx, ebx 772 jz near .end ; do nothing if data_len == 0 773 .begin: 774 cmp eax, byte 1 775 jg short .i_1more 776 777 mov ecx, [esp + 28] 778 mov edx, [ecx] ; edx = qlp_coeff[0] 779 mov eax, [esi - 4] ; eax = data[-1] 780 mov cl, [esp + 36] ; cl = lp_quantization 781 ALIGN 16 782 .i_1_loop_i: 783 imul eax, edx 784 sar eax, cl 785 neg eax 786 add eax, [esi] 787 mov [edi], eax 788 mov eax, [esi] 789 add edi, byte 4 790 add esi, byte 4 791 dec ebx 792 jnz .i_1_loop_i 793 794 jmp .end 795 796 .i_1more: 797 cmp eax, byte 32 ; for order <= 32 there is a faster routine 798 jbe short .i_32 799 800 ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32 801 ALIGN 16 802 .i_32more_loop_i: 803 xor ebp, ebp 804 mov ecx, [esp + 32] 805 mov edx, ecx 806 shl edx, 2 807 add 
edx, [esp + 28] 808 neg ecx 809 ALIGN 16 810 .i_32more_loop_j: 811 sub edx, byte 4 812 mov eax, [edx] 813 imul eax, [esi + 4 * ecx] 814 add ebp, eax 815 inc ecx 816 jnz short .i_32more_loop_j 817 818 mov cl, [esp + 36] 819 sar ebp, cl 820 neg ebp 821 add ebp, [esi] 822 mov [edi], ebp 823 add esi, byte 4 824 add edi, byte 4 825 826 dec ebx 827 jnz .i_32more_loop_i 828 829 jmp .end 830 831 .i_32: 832 sub edi, esi 833 neg eax 834 lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0] 835 call .get_eip0 836 .get_eip0: 837 pop eax 838 add edx, eax 839 inc edx 840 mov eax, [esp + 28] ; eax = qlp_coeff[] 841 xor ebp, ebp 842 jmp edx 843 844 mov ecx, [eax + 124] 845 imul ecx, [esi - 128] 846 add ebp, ecx 847 mov ecx, [eax + 120] 848 imul ecx, [esi - 124] 849 add ebp, ecx 850 mov ecx, [eax + 116] 851 imul ecx, [esi - 120] 852 add ebp, ecx 853 mov ecx, [eax + 112] 854 imul ecx, [esi - 116] 855 add ebp, ecx 856 mov ecx, [eax + 108] 857 imul ecx, [esi - 112] 858 add ebp, ecx 859 mov ecx, [eax + 104] 860 imul ecx, [esi - 108] 861 add ebp, ecx 862 mov ecx, [eax + 100] 863 imul ecx, [esi - 104] 864 add ebp, ecx 865 mov ecx, [eax + 96] 866 imul ecx, [esi - 100] 867 add ebp, ecx 868 mov ecx, [eax + 92] 869 imul ecx, [esi - 96] 870 add ebp, ecx 871 mov ecx, [eax + 88] 872 imul ecx, [esi - 92] 873 add ebp, ecx 874 mov ecx, [eax + 84] 875 imul ecx, [esi - 88] 876 add ebp, ecx 877 mov ecx, [eax + 80] 878 imul ecx, [esi - 84] 879 add ebp, ecx 880 mov ecx, [eax + 76] 881 imul ecx, [esi - 80] 882 add ebp, ecx 883 mov ecx, [eax + 72] 884 imul ecx, [esi - 76] 885 add ebp, ecx 886 mov ecx, [eax + 68] 887 imul ecx, [esi - 72] 888 add ebp, ecx 889 mov ecx, [eax + 64] 890 imul ecx, [esi - 68] 891 add ebp, ecx 892 mov ecx, [eax + 60] 893 imul ecx, [esi - 64] 894 add ebp, ecx 895 mov ecx, [eax + 56] 896 imul ecx, [esi - 60] 897 add ebp, ecx 898 mov ecx, [eax + 52] 899 imul ecx, [esi - 56] 900 add ebp, ecx 901 mov ecx, [eax + 48] 902 imul ecx, [esi - 52] 903 add ebp, ecx 904 mov ecx, [eax + 44] 905 imul 
ecx, [esi - 48] 906 add ebp, ecx 907 mov ecx, [eax + 40] 908 imul ecx, [esi - 44] 909 add ebp, ecx 910 mov ecx, [eax + 36] 911 imul ecx, [esi - 40] 912 add ebp, ecx 913 mov ecx, [eax + 32] 914 imul ecx, [esi - 36] 915 add ebp, ecx 916 mov ecx, [eax + 28] 917 imul ecx, [esi - 32] 918 add ebp, ecx 919 mov ecx, [eax + 24] 920 imul ecx, [esi - 28] 921 add ebp, ecx 922 mov ecx, [eax + 20] 923 imul ecx, [esi - 24] 924 add ebp, ecx 925 mov ecx, [eax + 16] 926 imul ecx, [esi - 20] 927 add ebp, ecx 928 mov ecx, [eax + 12] 929 imul ecx, [esi - 16] 930 add ebp, ecx 931 mov ecx, [eax + 8] 932 imul ecx, [esi - 12] 933 add ebp, ecx 934 mov ecx, [eax + 4] 935 imul ecx, [esi - 8] 936 add ebp, ecx 937 mov ecx, [eax] ; there is one byte missing 938 imul ecx, [esi - 4] 939 add ebp, ecx 940 .jumper_0: 941 942 mov cl, [esp + 36] 943 sar ebp, cl 944 neg ebp 945 add ebp, [esi] 946 mov [edi + esi], ebp 947 add esi, byte 4 948 949 dec ebx 950 jz short .end 951 xor ebp, ebp 952 jmp edx 953 954 .end: 955 pop edi 956 pop esi 957 pop ebx 958 pop ebp 959 ret 960 961 ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for 962 ; the channel and qlp_coeffs must be <= 16. Especially note that this routine 963 ; cannot be used for side-channel coded 16bps channels since the effective bps 964 ; is 17. 
;; MMX version of FLAC__lpc_compute_residual_from_qlp_coefficients().
;; C equivalent of the contract (cf. the plain ia32 routine above):
;;   residual[i] = data[i] - ((sum of qlp_coeff[j]*data[i-j-1]) >> lp_quantization)
;; The coefficients and the sample history are repacked to 16-bit words so
;; that pmaddwd computes two products at once; per the WATCHOUT above, this
;; is only valid when samples and qlp_coeffs fit in 16 bits (packssdw
;; saturates anything wider).  Two residuals are produced per loop iteration.
	ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
	;[esp + 40]	residual[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	data[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]			; esi = data[]
	mov	edi, [esp + 40]			; edi = residual[]
	mov	eax, [esp + 32]			; eax = order
	mov	ebx, [esp + 24]			; ebx = data_len

	test	ebx, ebx
	jz	near .end			; do nothing if data_len == 0
	dec	ebx
	test	ebx, ebx
	jz	near .last_one			; data_len == 1: the generic routine handles it
	; from here on ebx = data_len - 1, consumed 2 samples at a time

	mov	edx, [esp + 28]			; edx = qlp_coeff[]
	movd	mm6, [esp + 36]			; mm6 = 0:lp_quantization (shift count for psrad)
	mov	ebp, esp			; ebp = saved esp; esp becomes a scratch buffer pointer

	and	esp, 0xfffffff8			; 8-byte-align esp so the word buffer below can be read as qwords

	; Build a contiguous buffer of 16-bit copies of the coefficients on the
	; stack: qlp_coeff[0] is pushed first and so ends up at the highest address.
	xor	ecx, ecx
.copy_qlp_loop:
	push	word [edx + 4 * ecx]		; keep only the low 16 bits of each 32-bit coefficient
	inc	ecx
	cmp	ecx, eax
	jnz	short .copy_qlp_loop

	; Zero-pad the word buffer (and eax = order) up to a multiple of 4 words
	; so whole qwords can be fed to pmaddwd.
	and	ecx, 0x3
	test	ecx, ecx
	je	short .za_end
	sub	ecx, byte 4
.za_loop:
	push	word 0
	inc	eax				; eax = padded order (multiple of 4)
	inc	ecx
	jnz	short .za_loop
.za_end:

	movq	mm5, [esp + 2 * eax - 8]	; mm5 = qlp_coeff[0..3] as packed words (coeff[0] in the high lane)
	; mm4 = sliding 4-sample history window data[i-4..i-1], packed to words
	; (oldest sample in the low lane)
	movd	mm4, [esi - 16]
	punpckldq	mm4, [esi - 12]
	movd	mm0, [esi - 8]
	punpckldq	mm0, [esi - 4]
	packssdw	mm4, mm0

	cmp	eax, byte 4
	jnbe	short .mmx_4more		; order > 4 needs the coefficient-walk inner loop

	; Fast path: padded order == 4, all coefficients live in mm5.
	ALIGN 16
.mmx_4_loop_i:
	movd	mm1, [esi]			; load the two new samples data[i], data[i+1]
	movq	mm3, mm4			; mm3 = window for sample i
	punpckldq	mm1, [esi + 4]
	psrlq	mm4, 16				; shift data[i] into the window for sample i+1
	movq	mm0, mm1
	psllq	mm0, 48
	por	mm4, mm0
	movq	mm2, mm4			; mm2 = window for sample i+1
	psrlq	mm4, 16				; and shift data[i+1] in for the next iteration
	pxor	mm0, mm0
	punpckhdq	mm0, mm1
	pmaddwd	mm3, mm5			; partial dot products for sample i
	pmaddwd	mm2, mm5			; partial dot products for sample i+1
	psllq	mm0, 16
	por	mm4, mm0
	movq	mm0, mm3			; horizontal add: combine the two dword partials of each sum
	punpckldq	mm3, mm2
	punpckhdq	mm0, mm2
	paddd	mm3, mm0			; mm3 = sum(i+1):sum(i)
	psrad	mm3, mm6			; >> lp_quantization (both lanes)
	psubd	mm1, mm3			; residual = data - (sum >> lp_quantization)
	movd	[edi], mm1
	punpckhdq	mm1, mm1
	movd	[edi + 4], mm1

	add	edi, byte 8
	add	esi, byte 8

	sub	ebx, 2				; two samples consumed
	jg	.mmx_4_loop_i
	jmp	.mmx_end

.mmx_4more:
	shl	eax, 2
	neg	eax
	add	eax, byte 16			; eax = 16 - 4*order: esi+eax is where the coefficient walk starts

	ALIGN 16
.mmx_4more_loop_i:
	; Same windowing as the order<=4 loop: mm3/mm2 start as the 4 newest
	; history words for samples i and i+1; older history is accumulated
	; by the j loop below.
	movd	mm1, [esi]
	punpckldq	mm1, [esi + 4]
	movq	mm3, mm4
	psrlq	mm4, 16
	movq	mm0, mm1
	psllq	mm0, 48
	por	mm4, mm0
	movq	mm2, mm4
	psrlq	mm4, 16
	pxor	mm0, mm0
	punpckhdq	mm0, mm1
	pmaddwd	mm3, mm5
	pmaddwd	mm2, mm5
	psllq	mm0, 16
	por	mm4, mm0

	mov	ecx, esi
	add	ecx, eax			; ecx = oldest history needed for this sample pair
	mov	edx, esp			; edx walks the packed 16-bit coefficient buffer

	ALIGN 16
.mmx_4more_loop_j:
	; pack 4 older samples to words twice (offset by one sample for the
	; second residual) and accumulate 4 products per pmaddwd
	movd	mm0, [ecx - 16]
	movd	mm7, [ecx - 8]
	punpckldq	mm0, [ecx - 12]
	punpckldq	mm7, [ecx - 4]
	packssdw	mm0, mm7
	pmaddwd	mm0, [edx]
	punpckhdq	mm7, mm7
	paddd	mm3, mm0			; accumulate into sum(i)
	movd	mm0, [ecx - 12]
	punpckldq	mm0, [ecx - 8]
	punpckldq	mm7, [ecx]
	packssdw	mm0, mm7
	pmaddwd	mm0, [edx]
	paddd	mm2, mm0			; accumulate into sum(i+1)

	add	edx, byte 8
	add	ecx, byte 16
	cmp	ecx, esi
	jnz	.mmx_4more_loop_j

	movq	mm0, mm3			; horizontal add, shift, subtract: as in the order<=4 loop
	punpckldq	mm3, mm2
	punpckhdq	mm0, mm2
	paddd	mm3, mm0
	psrad	mm3, mm6
	psubd	mm1, mm3
	movd	[edi], mm1
	punpckhdq	mm1, mm1
	movd	[edi + 4], mm1

	add	edi, byte 8
	add	esi, byte 8

	sub	ebx, 2
	jg	near .mmx_4more_loop_i

.mmx_end:
	emms
	mov	esp, ebp			; restore esp so the [esp + N] argument offsets are valid again
.last_one:
	; ebx is 0 here when data_len was even, -1 when odd; after inc, a
	; nonzero ebx means exactly one sample remains.
	mov	eax, [esp + 32]
	inc	ebx
	jnz	near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin
	; NOTE(review): jumps into the sibling ia32 routine with the same stack
	; frame; that routine's epilogue performs the pops/ret for us.

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

; **********************************************************************
;
; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
; {
; 	unsigned i, j;
; 	FLAC__int32 sum;
;
; 	FLAC__ASSERT(order > 0);
;
; 	for(i = 0; i < data_len; i++) {
; 		sum = 0;
; 		for(j = 0; j < order; j++)
; 			sum += qlp_coeff[j] * data[i-j-1];
; 		data[i] =
;			residual[i] + (sum >> lp_quantization);
;	}
; }
	ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]			; esi = residual[]
	mov	edi, [esp + 40]			; edi = data[]
	mov	eax, [esp + 32]			; eax = order
	mov	ebx, [esp + 24]			; ebx = data_len

	test	ebx, ebx
	jz	near .end			; do nothing if data_len == 0

.begin:						; also the entry point used by the MMX routine for order < 4
	cmp	eax, byte 1
	jg	short .x87_1more

	; order == 1: the single coefficient and the running previous sample
	; both stay in registers for the whole loop.
	mov	ecx, [esp + 28]
	mov	edx, [ecx]			; edx = qlp_coeff[0]
	mov	eax, [edi - 4]			; eax = data[i-1]
	mov	cl, [esp + 36]			; cl = lp_quantization
	ALIGN 16
.x87_1_loop_i:
	imul	eax, edx			; eax = qlp_coeff[0] * data[i-1]
	sar	eax, cl				; eax = sum >> lp_quantization
	add	eax, [esi]			; eax += residual[i]
	mov	[edi], eax			; data[i] = eax; eax is also next iteration's data[i-1]
	add	esi, byte 4
	add	edi, byte 4
	dec	ebx
	jnz	.x87_1_loop_i

	jmp	.end

.x87_1more:
	cmp	eax, byte 32			; for order <= 32 there is a faster routine
	jbe	short .x87_32

	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
	ALIGN 16
.x87_32more_loop_i:
	xor	ebp, ebp			; ebp = sum
	mov	ecx, [esp + 32]
	mov	edx, ecx
	shl	edx, 2
	add	edx, [esp + 28]			; edx = &qlp_coeff[order], walked downward
	neg	ecx				; ecx = -order, counts up to 0
	ALIGN 16
.x87_32more_loop_j:
	sub	edx, byte 4
	mov	eax, [edx]			; eax = qlp_coeff[j]
	imul	eax, [edi + 4 * ecx]		; eax *= data[i-j-1]
	add	ebp, eax
	inc	ecx
	jnz	short .x87_32more_loop_j

	mov	cl, [esp + 36]
	sar	ebp, cl				; ebp = sum >> lp_quantization
	add	ebp, [esi]			; ebp += residual[i]
	mov	[edi], ebp			; data[i] = ebp
	add	edi, byte 4
	add	esi, byte 4

	dec	ebx
	jnz	.x87_32more_loop_i

	jmp	.end

.x87_32:
	sub	esi, edi			; esi = residual - data, so [esi + edi] = residual[i] while edi walks data[]
	neg	eax				; eax = -order
	; Computed jump into the unrolled chain below: each mul/accumulate
	; triplet is 9 bytes, so starting 9*order bytes before .jumper_0
	; executes exactly `order` triplets.  call/pop reads EIP so the
	; address works position-independently.
	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
	call	.get_eip0
.get_eip0:
	pop	eax
	add	edx, eax
	inc	edx				; compensate for the shorter opcode on the last iteration
	mov	eax, [esp + 28]			; eax = qlp_coeff[]
	xor	ebp, ebp			; ebp = sum = 0
	jmp	edx

	mov	ecx, [eax + 124]		; ecx = qlp_coeff[31]
	imul	ecx, [edi - 128]		; ecx = qlp_coeff[31] * data[i-32]
	add	ebp, ecx			; sum += qlp_coeff[31] * data[i-32]
	mov	ecx, [eax + 120]		; ecx = qlp_coeff[30]
	imul	ecx, [edi - 124]		; ecx = qlp_coeff[30] * data[i-31]
	add	ebp, ecx			; sum += qlp_coeff[30] * data[i-31]
	mov	ecx, [eax + 116]		; ecx = qlp_coeff[29]
	imul	ecx, [edi - 120]		; ecx = qlp_coeff[29] * data[i-30]
	add	ebp, ecx			; sum += qlp_coeff[29] * data[i-30]
	mov	ecx, [eax + 112]		; ecx = qlp_coeff[28]
	imul	ecx, [edi - 116]		; ecx = qlp_coeff[28] * data[i-29]
	add	ebp, ecx			; sum += qlp_coeff[28] * data[i-29]
	mov	ecx, [eax + 108]		; ecx = qlp_coeff[27]
	imul	ecx, [edi - 112]		; ecx = qlp_coeff[27] * data[i-28]
	add	ebp, ecx			; sum += qlp_coeff[27] * data[i-28]
	mov	ecx, [eax + 104]		; ecx = qlp_coeff[26]
	imul	ecx, [edi - 108]		; ecx = qlp_coeff[26] * data[i-27]
	add	ebp, ecx			; sum += qlp_coeff[26] * data[i-27]
	mov	ecx, [eax + 100]		; ecx = qlp_coeff[25]
	imul	ecx, [edi - 104]		; ecx = qlp_coeff[25] * data[i-26]
	add	ebp, ecx			; sum += qlp_coeff[25] * data[i-26]
	mov	ecx, [eax + 96]			; ecx = qlp_coeff[24]
	imul	ecx, [edi - 100]		; ecx = qlp_coeff[24] * data[i-25]
	add	ebp, ecx			; sum += qlp_coeff[24] * data[i-25]
	mov	ecx, [eax + 92]			; ecx = qlp_coeff[23]
	imul	ecx, [edi - 96]			; ecx = qlp_coeff[23] * data[i-24]
	add	ebp, ecx			; sum += qlp_coeff[23] * data[i-24]
	mov	ecx, [eax + 88]			; ecx = qlp_coeff[22]
	imul	ecx, [edi - 92]			; ecx = qlp_coeff[22] * data[i-23]
	add	ebp, ecx			; sum += qlp_coeff[22] * data[i-23]
	mov	ecx, [eax + 84]			; ecx = qlp_coeff[21]
	imul	ecx, [edi - 88]			; ecx = qlp_coeff[21] * data[i-22]
	add	ebp, ecx			; sum += qlp_coeff[21] * data[i-22]
	mov	ecx, [eax + 80]			; ecx = qlp_coeff[20]
	imul	ecx, [edi - 84]			; ecx = qlp_coeff[20] * data[i-21]
	add	ebp, ecx			; sum += qlp_coeff[20] * data[i-21]
	mov	ecx, [eax + 76]			; ecx = qlp_coeff[19]
	imul	ecx, [edi - 80]			; ecx = qlp_coeff[19] * data[i-20]
	add	ebp, ecx			; sum += qlp_coeff[19] * data[i-20]
	mov	ecx, [eax + 72]			; ecx = qlp_coeff[18]
	imul	ecx, [edi - 76]			; ecx = qlp_coeff[18] * data[i-19]
	add	ebp, ecx			; sum += qlp_coeff[18] * data[i-19]
	mov	ecx, [eax + 68]			; ecx = qlp_coeff[17]
	imul	ecx, [edi - 72]			; ecx = qlp_coeff[17] * data[i-18]
	add	ebp, ecx			; sum += qlp_coeff[17] * data[i-18]
	mov	ecx, [eax + 64]			; ecx = qlp_coeff[16]
	imul	ecx, [edi - 68]			; ecx = qlp_coeff[16] * data[i-17]
	add	ebp, ecx			; sum += qlp_coeff[16] * data[i-17]
	mov	ecx, [eax + 60]			; ecx = qlp_coeff[15]
	imul	ecx, [edi - 64]			; ecx = qlp_coeff[15] * data[i-16]
	add	ebp, ecx			; sum += qlp_coeff[15] * data[i-16]
	mov	ecx, [eax + 56]			; ecx = qlp_coeff[14]
	imul	ecx, [edi - 60]			; ecx = qlp_coeff[14] * data[i-15]
	add	ebp, ecx			; sum += qlp_coeff[14] * data[i-15]
	mov	ecx, [eax + 52]			; ecx = qlp_coeff[13]
	imul	ecx, [edi - 56]			; ecx = qlp_coeff[13] * data[i-14]
	add	ebp, ecx			; sum += qlp_coeff[13] * data[i-14]
	mov	ecx, [eax + 48]			; ecx = qlp_coeff[12]
	imul	ecx, [edi - 52]			; ecx = qlp_coeff[12] * data[i-13]
	add	ebp, ecx			; sum += qlp_coeff[12] * data[i-13]
	mov	ecx, [eax + 44]			; ecx = qlp_coeff[11]
	imul	ecx, [edi - 48]			; ecx = qlp_coeff[11] * data[i-12]
	add	ebp, ecx			; sum += qlp_coeff[11] * data[i-12]
	mov	ecx, [eax + 40]			; ecx = qlp_coeff[10]
	imul	ecx, [edi - 44]			; ecx = qlp_coeff[10] * data[i-11]
	add	ebp, ecx			; sum += qlp_coeff[10] * data[i-11]
	mov	ecx, [eax + 36]			; ecx = qlp_coeff[ 9]
	imul	ecx, [edi - 40]			; ecx = qlp_coeff[ 9] * data[i-10]
	add	ebp, ecx			; sum += qlp_coeff[ 9] * data[i-10]
	mov	ecx, [eax + 32]			; ecx = qlp_coeff[ 8]
	imul	ecx, [edi - 36]			; ecx = qlp_coeff[ 8] * data[i- 9]
	add	ebp, ecx			; sum += qlp_coeff[ 8] * data[i- 9]
	mov	ecx, [eax + 28]			; ecx = qlp_coeff[ 7]
	imul	ecx, [edi - 32]			; ecx = qlp_coeff[ 7] * data[i- 8]
	add	ebp, ecx			; sum += qlp_coeff[ 7] * data[i- 8]
	mov	ecx, [eax + 24]			; ecx = qlp_coeff[ 6]
	imul	ecx, [edi - 28]			; ecx = qlp_coeff[ 6] * data[i- 7]
	add	ebp, ecx			; sum += qlp_coeff[ 6] * data[i- 7]
	mov	ecx, [eax + 20]			; ecx = qlp_coeff[ 5]
	imul	ecx, [edi - 24]			; ecx = qlp_coeff[ 5] * data[i- 6]
	add	ebp, ecx			; sum += qlp_coeff[ 5] * data[i- 6]
	mov	ecx, [eax + 16]			; ecx = qlp_coeff[ 4]
	imul	ecx, [edi - 20]			; ecx = qlp_coeff[ 4] * data[i- 5]
	add	ebp, ecx			; sum += qlp_coeff[ 4] * data[i- 5]
	mov	ecx, [eax + 12]			; ecx = qlp_coeff[ 3]
	imul	ecx, [edi - 16]			; ecx = qlp_coeff[ 3] * data[i- 4]
	add	ebp, ecx			; sum += qlp_coeff[ 3] * data[i- 4]
	mov	ecx, [eax + 8]			; ecx = qlp_coeff[ 2]
	imul	ecx, [edi - 12]			; ecx = qlp_coeff[ 2] * data[i- 3]
	add	ebp, ecx			; sum += qlp_coeff[ 2] * data[i- 3]
	mov	ecx, [eax + 4]			; ecx = qlp_coeff[ 1]
	imul	ecx, [edi - 8]			; ecx = qlp_coeff[ 1] * data[i- 2]
	add	ebp, ecx			; sum += qlp_coeff[ 1] * data[i- 2]
	mov	ecx, [eax]			; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
	imul	ecx, [edi - 4]			; ecx = qlp_coeff[ 0] * data[i- 1]
	add	ebp, ecx			; sum += qlp_coeff[ 0] * data[i- 1]
.jumper_0:

	mov	cl, [esp + 36]
	sar	ebp, cl				; ebp = (sum >> lp_quantization)
	add	ebp, [esi + edi]		; ebp = residual[i] + (sum >> lp_quantization)
	mov	[edi], ebp			; data[i] = residual[i] + (sum >> lp_quantization)
	add	edi, byte 4

	dec	ebx
	jz	short .end
	xor	ebp, ebp			; reset sum and re-enter the unrolled chain for the next sample
	jmp	edx

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel and qlp_coeffs must be <= 16. Especially note that this routine
; cannot be used for side-channel coded 16bps channels since the effective bps
; is 17.
; WATCHOUT: this routine requires that each data array have a buffer of up to
; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
;; MMX version of FLAC__lpc_restore_signal() (see the pseudocode above the
;; plain ia32 routine): data[i] = residual[i] + (sum >> lp_quantization).
;; Same 16-bit packing scheme as the residual MMX routine: coefficients are
;; copied to a word buffer on the stack and pmaddwd forms two products per
;; instruction.  One sample is produced per loop iteration, since each output
;; feeds back into the history window.
	ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32_mmx
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]			; esi = residual[]
	mov	edi, [esp + 40]			; edi = data[]
	mov	eax, [esp + 32]			; eax = order
	mov	ebx, [esp + 24]			; ebx = data_len

	test	ebx, ebx
	jz	near .end			; do nothing if data_len == 0
	cmp	eax, byte 4
	jb	near FLAC__lpc_restore_signal_asm_ia32.begin	; order < 4: delegate to the generic routine (same stack frame)

	mov	edx, [esp + 28]			; edx = qlp_coeff[]
	movd	mm6, [esp + 36]			; mm6 = 0:lp_quantization (shift count for psrad)
	mov	ebp, esp			; ebp = saved esp; esp becomes a scratch buffer pointer

	and	esp, 0xfffffff8			; 8-byte-align esp so the word buffer can be read as qwords

	; Push 16-bit copies of the coefficients; qlp_coeff[0] ends up at the
	; highest address.
	xor	ecx, ecx
.copy_qlp_loop:
	push	word [edx + 4 * ecx]
	inc	ecx
	cmp	ecx, eax
	jnz	short .copy_qlp_loop

	; Zero-pad the buffer (and eax = order) up to a multiple of 4 words.
	and	ecx, 0x3
	test	ecx, ecx
	je	short .za_end
	sub	ecx, byte 4
.za_loop:
	push	word 0
	inc	eax				; eax = padded order (multiple of 4)
	inc	ecx
	jnz	short .za_loop
.za_end:

	movq	mm5, [esp + 2 * eax - 8]	; mm5 = qlp_coeff[0..3] as packed words (coeff[0] in the high lane)
	; mm4 = sliding history window data[i-4..i-1], packed to words
	movd	mm4, [edi - 16]
	punpckldq	mm4, [edi - 12]
	movd	mm0, [edi - 8]
	punpckldq	mm0, [edi - 4]
	packssdw	mm4, mm0

	cmp	eax, byte 4
	jnbe	short .mmx_4more		; order > 4 needs the coefficient-walk inner loop

	; Fast path: padded order == 4, all coefficients live in mm5.
	ALIGN 16
.mmx_4_loop_i:
	movq	mm7, mm4
	pmaddwd	mm7, mm5			; two partial dword sums of coeff*history products
	movq	mm0, mm7
	punpckhdq	mm7, mm7
	paddd	mm7, mm0			; horizontal add -> full sum in the low dword
	psrad	mm7, mm6			; sum >> lp_quantization
	movd	mm1, [esi]
	paddd	mm7, mm1			; + residual[i]
	movd	[edi], mm7			; data[i] = result
	psllq	mm7, 48				; shift the new sample's low word into the
	psrlq	mm4, 16				; history window for the next iteration
	por	mm4, mm7

	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	.mmx_4_loop_i
	jmp	.mmx_end
.mmx_4more:
	shl	eax, 2
	neg	eax
	add	eax, byte 16			; eax = 16 - 4*order: edi+eax is where the coefficient walk starts
	ALIGN 16
.mmx_4more_loop_i:
	mov	ecx, edi
	add	ecx, eax			; ecx = oldest history needed for this sample
	mov	edx, esp			; edx walks the packed 16-bit coefficient buffer

	movq	mm7, mm4			; start the sum with the 4 newest history words
	pmaddwd	mm7, mm5

	ALIGN 16
.mmx_4more_loop_j:
	; pack 4 older samples to words and accumulate 4 products per pmaddwd
	movd	mm0, [ecx - 16]
	punpckldq	mm0, [ecx - 12]
	movd	mm1, [ecx - 8]
	punpckldq	mm1, [ecx - 4]
	packssdw	mm0, mm1
	pmaddwd	mm0, [edx]
	paddd	mm7, mm0

	add	edx, byte 8
	add	ecx, byte 16
	cmp	ecx, edi
	jnz	.mmx_4more_loop_j

	movq	mm0, mm7			; horizontal add, shift, add residual: as in the fast path
	punpckhdq	mm7, mm7
	paddd	mm7, mm0
	psrad	mm7, mm6
	movd	mm1, [esi]
	paddd	mm7, mm1
	movd	[edi], mm7
	psllq	mm7, 48				; feed the new sample back into the history window
	psrlq	mm4, 16
	por	mm4, mm7

	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	short .mmx_4more_loop_i
.mmx_end:
	emms
	mov	esp, ebp			; restore esp saved before the stack buffer was built

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

end

%ifdef OBJ_FORMAT_elf
	section .note.GNU-stack noalloc
%endif