# libFLAC - Free Lossless Audio Codec library
# Copyright (C) 2004,2005,2006,2007 Josh Coalson
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# - Neither the name of the Xiph.org Foundation nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.text
	.align 2
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16
.type _FLAC__lpc_restore_signal_asm_ppc_altivec_16, @function

.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8
.type _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8, @function

_FLAC__lpc_restore_signal_asm_ppc_altivec_16:
	# r3: residual[]
	# r4: data_len
	# r5: qlp_coeff[]
	# r6: order
	# r7: lp_quantization
	# r8: data[]

	# see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
	# this is a PowerPC/Altivec assembly version which requires bps<=16 (or an
	# actual bps<=15 for mid-side coding, since that uses an extra bit)

	# this should be fast; the inner loop is unrolled (it takes no more than
	# 3*(order/4) instructions, all of which are arithmetic), and all of the
	# coefficients and all relevant history stay in registers, so the outer loop
	# has only one load from memory (the residual)

	# I have not yet run this through simg4, so there may be some avoidable stalls,
	# and there may be a somewhat more clever way to do the outer loop

	# the branch mechanism may prevent dynamic loading; I still need to examine
	# this issue, and there may be a more elegant method
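
	# for orientation: the scalar routine this corresponds to is roughly the
	# following (a simplified sketch of FLAC__lpc_restore_signal() from
	# src/libFLAC/lpc.c, not a verbatim copy; types and bounds handling omitted):
	#
	#   for(i = 0; i < data_len; i++) {
	#       sum = 0;
	#       for(j = 0; j < order; j++)
	#           sum += qlp_coeff[j] * data[i-j-1];
	#       data[i] = residual[i] + (sum >> lp_quantization);
	#   }
	#
	# the vector code below keeps qlp_coeff[] and the data[] history in vector
	# registers and evaluates the inner loop four taps at a time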

	stmw r31,-4(r1)

	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31		# for quadword-aligned stack data

	slwi r6,r6,2		# adjust for word size
	slwi r4,r4,2
	add r4,r4,r8		# r4 = data+data_len

	mfspr r0,256		# cache old vrsave
	addis r31,0,0xffff
	ori r31,r31,0xfc00
	mtspr 256,r31		# declare VRs in vrsave

	cmplw cr0,r8,r4		# i<data_len
	bc 4,0,L1400

	# load coefficients into v0-v7 and initial history into v8-v15
	li r31,0xf
	and r31,r8,r31		# r31: data%4
	li r11,16
	subf r31,r31,r11	# r31: 4-(data%4)
	slwi r31,r31,3		# convert to bits for vsro
	li r10,-4
	stw r31,-4(r9)
	lvewx v0,r10,r9
	vspltisb v18,-1
	vsro v18,v18,v0		# v18: mask vector

	li r31,0x8
	lvsl v0,0,r31
	vsldoi v0,v0,v0,12
	li r31,0xc
	lvsl v1,0,r31
	vspltisb v2,0
	vspltisb v3,-1
	vmrglw v2,v2,v3
	vsel v0,v1,v0,v2	# v0: reversal permutation vector

	add r10,r5,r6
	lvsl v17,0,r5		# v17: coefficient alignment permutation vector
	vperm v17,v17,v17,v0	# v17: reversal coefficient alignment permutation vector

	mr r11,r8
	lvsl v16,0,r11		# v16: history alignment permutation vector

	lvx v0,0,r5
	addi r5,r5,16
	lvx v1,0,r5
	vperm v0,v0,v1,v17
	lvx v8,0,r11
	addi r11,r11,-16
	lvx v9,0,r11
	vperm v8,v9,v8,v16
	cmplw cr0,r5,r10
	bc 12,0,L1101
	vand v0,v0,v18
	addis r31,0,L1307@ha
	ori r31,r31,L1307@l
	b L1199

L1101:
	addi r5,r5,16
	lvx v2,0,r5
	vperm v1,v1,v2,v17
	addi r11,r11,-16
	lvx v10,0,r11
	vperm v9,v10,v9,v16
	cmplw cr0,r5,r10
	bc 12,0,L1102
	vand v1,v1,v18
	addis r31,0,L1306@ha
	ori r31,r31,L1306@l
	b L1199

L1102:
	addi r5,r5,16
	lvx v3,0,r5
	vperm v2,v2,v3,v17
	addi r11,r11,-16
	lvx v11,0,r11
	vperm v10,v11,v10,v16
	cmplw cr0,r5,r10
	bc 12,0,L1103
	vand v2,v2,v18
	lis r31,L1305@ha
	la r31,L1305@l(r31)
	b L1199

L1103:
	addi r5,r5,16
	lvx v4,0,r5
	vperm v3,v3,v4,v17
	addi r11,r11,-16
	lvx v12,0,r11
	vperm v11,v12,v11,v16
	cmplw cr0,r5,r10
	bc 12,0,L1104
	vand v3,v3,v18
	lis r31,L1304@ha
	la r31,L1304@l(r31)
	b L1199

L1104:
	addi r5,r5,16
	lvx v5,0,r5
	vperm v4,v4,v5,v17
	addi r11,r11,-16
	lvx v13,0,r11
	vperm v12,v13,v12,v16
	cmplw cr0,r5,r10
	bc 12,0,L1105
	vand v4,v4,v18
	lis r31,L1303@ha
	la r31,L1303@l(r31)
	b L1199

L1105:
	addi r5,r5,16
	lvx v6,0,r5
	vperm v5,v5,v6,v17
	addi r11,r11,-16
	lvx v14,0,r11
	vperm v13,v14,v13,v16
	cmplw cr0,r5,r10
	bc 12,0,L1106
	vand v5,v5,v18
	lis r31,L1302@ha
	la r31,L1302@l(r31)
	b L1199

L1106:
	addi r5,r5,16
	lvx v7,0,r5
	vperm v6,v6,v7,v17
	addi r11,r11,-16
	lvx v15,0,r11
	vperm v14,v15,v14,v16
	cmplw cr0,r5,r10
	bc 12,0,L1107
	vand v6,v6,v18
	lis r31,L1301@ha
	la r31,L1301@l(r31)
	b L1199

L1107:
	addi r5,r5,16
	lvx v19,0,r5
	vperm v7,v7,v19,v17
	addi r11,r11,-16
	lvx v19,0,r11
	vperm v15,v19,v15,v16
	vand v7,v7,v18
	lis r31,L1300@ha
	la r31,L1300@l(r31)

L1199:
	mtctr r31

	# set up invariant vectors
	vspltish v16,0		# v16: zero vector

	li r10,-12
	lvsr v17,r10,r8		# v17: result shift vector
	lvsl v18,r10,r3		# v18: residual shift back vector

	li r10,-4
	stw r7,-4(r9)
	lvewx v19,r10,r9	# v19: lp_quantization vector

L1200:
	vmulosh v20,v0,v8	# v20: sum vector
	bcctr 20,0

L1300:
	vmulosh v21,v7,v15
	vsldoi v15,v15,v14,4	# increment history
	vaddsws v20,v20,v21

L1301:
	vmulosh v21,v6,v14
	vsldoi v14,v14,v13,4
	vaddsws v20,v20,v21

L1302:
	vmulosh v21,v5,v13
	vsldoi v13,v13,v12,4
	vaddsws v20,v20,v21

L1303:
	vmulosh v21,v4,v12
	vsldoi v12,v12,v11,4
	vaddsws v20,v20,v21

L1304:
	vmulosh v21,v3,v11
	vsldoi v11,v11,v10,4
	vaddsws v20,v20,v21

L1305:
	vmulosh v21,v2,v10
	vsldoi v10,v10,v9,4
	vaddsws v20,v20,v21

L1306:
	vmulosh v21,v1,v9
	vsldoi v9,v9,v8,4
	vaddsws v20,v20,v21

L1307:
	vsumsws v20,v20,v16	# v20[3]: sum
	vsraw v20,v20,v19	# v20[3]: sum >> lp_quantization

	lvewx v21,0,r3		# v21[n]: *residual
	vperm v21,v21,v21,v18	# v21[3]: *residual
	vaddsws v20,v21,v20	# v20[3]: *residual + (sum >> lp_quantization)
	vsldoi v18,v18,v18,4	# increment shift vector

	vperm v21,v20,v20,v17	# v21[n]: shift for storage
	vsldoi v17,v17,v17,12	# increment shift vector
	stvewx v21,0,r8

	vsldoi v20,v20,v20,12
	vsldoi v8,v8,v20,4	# insert value onto history

	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4		# i<data_len
	bc 12,0,L1200

L1400:
	mtspr 256,r0		# restore old vrsave
	lmw r31,-4(r1)
	blr

_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
	# r3: residual[]
	# r4: data_len
	# r5: qlp_coeff[]
	# r6: order
	# r7: lp_quantization
	# r8: data[]

	# see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
	# this version assumes order<=8; it uses fewer vector registers, which should
	# save time in context switches, and has less code, which may improve
	# instruction caching
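
	# a caller would choose between the two entry points based on the predictor
	# order; a hypothetical dispatch (illustration only, not the decoder's actual
	# selection code) might read:
	#
	#   if(order <= 8)
	#       FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8(residual, data_len, qlp_coeff, order, lp_quantization, data);
	#   else
	#       FLAC__lpc_restore_signal_asm_ppc_altivec_16(residual, data_len, qlp_coeff, order, lp_quantization, data);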

	stmw r31,-4(r1)

	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31		# for quadword-aligned stack data

	slwi r6,r6,2		# adjust for word size
	slwi r4,r4,2
	add r4,r4,r8		# r4 = data+data_len

	mfspr r0,256		# cache old vrsave
	addis r31,0,0xffc0
	ori r31,r31,0x0000
	mtspr 256,r31		# declare VRs in vrsave

	cmplw cr0,r8,r4		# i<data_len
	bc 4,0,L2400

	# load coefficients into v0-v1 and initial history into v2-v3
	li r31,0xf
	and r31,r8,r31		# r31: data%4
	li r11,16
	subf r31,r31,r11	# r31: 4-(data%4)
	slwi r31,r31,3		# convert to bits for vsro
	li r10,-4
	stw r31,-4(r9)
	lvewx v0,r10,r9
	vspltisb v6,-1
	vsro v6,v6,v0		# v6: mask vector

	li r31,0x8
	lvsl v0,0,r31
	vsldoi v0,v0,v0,12
	li r31,0xc
	lvsl v1,0,r31
	vspltisb v2,0
	vspltisb v3,-1
	vmrglw v2,v2,v3
	vsel v0,v1,v0,v2	# v0: reversal permutation vector

	add r10,r5,r6
	lvsl v5,0,r5		# v5: coefficient alignment permutation vector
	vperm v5,v5,v5,v0	# v5: reversal coefficient alignment permutation vector

	mr r11,r8
	lvsl v4,0,r11		# v4: history alignment permutation vector

	lvx v0,0,r5
	addi r5,r5,16
	lvx v1,0,r5
	vperm v0,v0,v1,v5
	lvx v2,0,r11
	addi r11,r11,-16
	lvx v3,0,r11
	vperm v2,v3,v2,v4
	cmplw cr0,r5,r10
	bc 12,0,L2101
	vand v0,v0,v6
	lis r31,L2301@ha
	la r31,L2301@l(r31)
	b L2199

L2101:
	addi r5,r5,16
	lvx v7,0,r5
	vperm v1,v1,v7,v5
	addi r11,r11,-16
	lvx v7,0,r11
	vperm v3,v7,v3,v4
	vand v1,v1,v6
	lis r31,L2300@ha
	la r31,L2300@l(r31)

L2199:
	mtctr r31

	# set up invariant vectors
	vspltish v4,0		# v4: zero vector

	li r10,-12
	lvsr v5,r10,r8		# v5: result shift vector
	lvsl v6,r10,r3		# v6: residual shift back vector

	li r10,-4
	stw r7,-4(r9)
	lvewx v7,r10,r9		# v7: lp_quantization vector

L2200:
	vmulosh v8,v0,v2	# v8: sum vector
	bcctr 20,0

L2300:
	vmulosh v9,v1,v3
	vsldoi v3,v3,v2,4
	vaddsws v8,v8,v9

L2301:
	vsumsws v8,v8,v4	# v8[3]: sum
	vsraw v8,v8,v7		# v8[3]: sum >> lp_quantization

	lvewx v9,0,r3		# v9[n]: *residual
	vperm v9,v9,v9,v6	# v9[3]: *residual
	vaddsws v8,v9,v8	# v8[3]: *residual + (sum >> lp_quantization)
	vsldoi v6,v6,v6,4	# increment shift vector

	vperm v9,v8,v8,v5	# v9[n]: shift for storage
	vsldoi v5,v5,v5,12	# increment shift vector
	stvewx v9,0,r8

	vsldoi v8,v8,v8,12
	vsldoi v2,v2,v8,4	# insert value onto history

	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4		# i<data_len
	bc 12,0,L2200

L2400:
	mtspr 256,r0		# restore old vrsave
	lmw r31,-4(r1)
	blr