;  libFLAC - Free Lossless Audio Codec library
;  Copyright (C) 2004,2005,2006,2007  Josh Coalson
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;
;  - Redistributions of source code must retain the above copyright
;  notice, this list of conditions and the following disclaimer.
;
;  - Redistributions in binary form must reproduce the above copyright
;  notice, this list of conditions and the following disclaimer in the
;  documentation and/or other materials provided with the distribution.
;
;  - Neither the name of the Xiph.org Foundation nor the names of its
;  contributors may be used to endorse or promote products derived from
;  this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.text
	.align 2
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16

.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8

_FLAC__lpc_restore_signal_asm_ppc_altivec_16:
;	r3: residual[]
;	r4: data_len
;	r5: qlp_coeff[]
;	r6: order
;	r7: lp_quantization
;	r8: data[]

; see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
; this is a PowerPC/AltiVec assembly version which requires bps<=16 (or actual
; bps<=15 for mid-side coding, since that uses an extra bit)
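;
; for reference, a minimal C sketch of the scalar loop this routine vectorizes
; (a paraphrase of src/libFLAC/lpc.c, not the exact source; FLAC__int32
; written as int32_t, and the helper name is only for illustration):
;
;	#include <stdint.h>
;
;	void lpc_restore_signal(const int32_t residual[], unsigned data_len,
;	                        const int32_t qlp_coeff[], unsigned order,
;	                        int lp_quantization, int32_t data[])
;	{
;		for(unsigned i = 0; i < data_len; i++) {
;			int32_t sum = 0;
;			for(unsigned j = 0; j < order; j++)
;				sum += qlp_coeff[j] * data[i-j-1]; /* history before data[i] */
;			data[i] = residual[i] + (sum >> lp_quantization);
;		}
;	}
;
; with bps<=16 each product fits in 32 bits, which is what lets the vector
; code below use 16x16->32 multiplies (vmulosh)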

; this should be fast; the inner loop is unrolled (it takes roughly
; 3*(order/4) instructions, all of which are arithmetic), and all of the
; coefficients and all relevant history stay in registers, so the outer loop
; has only one load from memory (the residual)
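;
; e.g. for order=12 the coefficients occupy v0-v2 and the mtctr target chosen
; below is L1305, so each sample costs the vmulosh at L1200 plus two unrolled
; multiply/shift/accumulate triples (L1305, L1306) and the L1307 sum/shift tail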

; I have not yet run this through simg4, so there may be some avoidable stalls,
; and there may be a somewhat more clever way to do the outer loop

; the branch mechanism may prevent dynamic loading; I still need to examine
; this issue, and there may be a more elegant method

	stmw r31,-4(r1)

	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31 ; for quadword-aligned stack data

	slwi r6,r6,2 ; adjust for word size
	slwi r4,r4,2
	add r4,r4,r8 ; r4 = data+data_len

	mfspr r0,256 ; cache old vrsave
	addis r31,0,hi16(0xfffffc00)
	ori r31,r31,lo16(0xfffffc00)
	mtspr 256,r31 ; declare VRs in vrsave

	cmplw cr0,r8,r4 ; i<data_len
	bc 4,0,L1400
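	; (bc 4,0 branches when cr0[lt] is clear, i.e. straight to the epilogue
	; when data_len is 0)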

	; load coefficients into v0-v7 and initial history into v8-v15
	li r31,0xf
	and r31,r8,r31 ; r31: data%16 (in bytes)
	li r11,16
	subf r31,r31,r11 ; r31: 16-(data%16)
	slwi r31,r31,3 ; convert to bits for vsro
	li r10,-4
	stw r31,-4(r9)
	lvewx v0,r10,r9
	vspltisb v18,-1
	vsro v18,v18,v0 ; v18: mask vector

	li r31,0x8
	lvsl v0,0,r31
	vsldoi v0,v0,v0,12
	li r31,0xc
	lvsl v1,0,r31
	vspltisb v2,0
	vspltisb v3,-1
	vmrglw v2,v2,v3
	vsel v0,v1,v0,v2 ; v0: reversal permutation vector
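	; (net effect: v0 reverses the four 32-bit words of a quadword, which is
	; used below to flip the coefficients into the history's lane order)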

	add r10,r5,r6
	lvsl v17,0,r5 ; v17: coefficient alignment permutation vector
	vperm v17,v17,v17,v0 ; v17: reversal coefficient alignment permutation vector

	mr r11,r8
	lvsl v16,0,r11 ; v16: history alignment permutation vector

	lvx v0,0,r5
	addi r5,r5,16
	lvx v1,0,r5
	vperm v0,v0,v1,v17
	lvx v8,0,r11
	addi r11,r11,-16
	lvx v9,0,r11
	vperm v8,v9,v8,v16
	cmplw cr0,r5,r10
	bc 12,0,L1101
	vand v0,v0,v18
	addis r31,0,hi16(L1307)
	ori r31,r31,lo16(L1307)
	b L1199

L1101:
	addi r5,r5,16
	lvx v2,0,r5
	vperm v1,v1,v2,v17
	addi r11,r11,-16
	lvx v10,0,r11
	vperm v9,v10,v9,v16
	cmplw cr0,r5,r10
	bc 12,0,L1102
	vand v1,v1,v18
	addis r31,0,hi16(L1306)
	ori r31,r31,lo16(L1306)
	b L1199

L1102:
	addi r5,r5,16
	lvx v3,0,r5
	vperm v2,v2,v3,v17
	addi r11,r11,-16
	lvx v11,0,r11
	vperm v10,v11,v10,v16
	cmplw cr0,r5,r10
	bc 12,0,L1103
	vand v2,v2,v18
	addis r31,0,hi16(L1305)
	ori r31,r31,lo16(L1305)
	b L1199

L1103:
	addi r5,r5,16
	lvx v4,0,r5
	vperm v3,v3,v4,v17
	addi r11,r11,-16
	lvx v12,0,r11
	vperm v11,v12,v11,v16
	cmplw cr0,r5,r10
	bc 12,0,L1104
	vand v3,v3,v18
	addis r31,0,hi16(L1304)
	ori r31,r31,lo16(L1304)
	b L1199

L1104:
	addi r5,r5,16
	lvx v5,0,r5
	vperm v4,v4,v5,v17
	addi r11,r11,-16
	lvx v13,0,r11
	vperm v12,v13,v12,v16
	cmplw cr0,r5,r10
	bc 12,0,L1105
	vand v4,v4,v18
	addis r31,0,hi16(L1303)
	ori r31,r31,lo16(L1303)
	b L1199

L1105:
	addi r5,r5,16
	lvx v6,0,r5
	vperm v5,v5,v6,v17
	addi r11,r11,-16
	lvx v14,0,r11
	vperm v13,v14,v13,v16
	cmplw cr0,r5,r10
	bc 12,0,L1106
	vand v5,v5,v18
	addis r31,0,hi16(L1302)
	ori r31,r31,lo16(L1302)
	b L1199

L1106:
	addi r5,r5,16
	lvx v7,0,r5
	vperm v6,v6,v7,v17
	addi r11,r11,-16
	lvx v15,0,r11
	vperm v14,v15,v14,v16
	cmplw cr0,r5,r10
	bc 12,0,L1107
	vand v6,v6,v18
	addis r31,0,hi16(L1301)
	ori r31,r31,lo16(L1301)
	b L1199

L1107:
	addi r5,r5,16
	lvx v19,0,r5
	vperm v7,v7,v19,v17
	addi r11,r11,-16
	lvx v19,0,r11
	vperm v15,v19,v15,v16
	vand v7,v7,v18
	addis r31,0,hi16(L1300)
	ori r31,r31,lo16(L1300)

L1199:
	mtctr r31

	; set up invariant vectors
	vspltish v16,0 ; v16: zero vector

	li r10,-12
	lvsr v17,r10,r8 ; v17: result shift vector
	lvsl v18,r10,r3 ; v18: residual shift back vector

	li r10,-4
	stw r7,-4(r9)
	lvewx v19,r10,r9 ; v19: lp_quantization vector
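	; note: v17 and v18 are rotated by 12 and 4 bytes respectively at the
	; bottom of the loop so the active lane tracks the advancing data and
	; residual pointers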

L1200:
	vmulosh v20,v0,v8 ; v20: sum vector
	bcctr 20,0

L1300:
	vmulosh v21,v7,v15
	vsldoi v15,v15,v14,4 ; increment history
	vaddsws v20,v20,v21

L1301:
	vmulosh v21,v6,v14
	vsldoi v14,v14,v13,4
	vaddsws v20,v20,v21

L1302:
	vmulosh v21,v5,v13
	vsldoi v13,v13,v12,4
	vaddsws v20,v20,v21

L1303:
	vmulosh v21,v4,v12
	vsldoi v12,v12,v11,4
	vaddsws v20,v20,v21

L1304:
	vmulosh v21,v3,v11
	vsldoi v11,v11,v10,4
	vaddsws v20,v20,v21

L1305:
	vmulosh v21,v2,v10
	vsldoi v10,v10,v9,4
	vaddsws v20,v20,v21

L1306:
	vmulosh v21,v1,v9
	vsldoi v9,v9,v8,4
	vaddsws v20,v20,v21

L1307:
	vsumsws v20,v20,v16 ; v20[3]: sum
	vsraw v20,v20,v19 ; v20[3]: sum >> lp_quantization

	lvewx v21,0,r3 ; v21[n]: *residual
	vperm v21,v21,v21,v18 ; v21[3]: *residual
	vaddsws v20,v21,v20 ; v20[3]: *residual + (sum >> lp_quantization)
	vsldoi v18,v18,v18,4 ; increment shift vector

	vperm v21,v20,v20,v17 ; v21[n]: shift for storage
	vsldoi v17,v17,v17,12 ; increment shift vector
	stvewx v21,0,r8

	vsldoi v20,v20,v20,12
	vsldoi v8,v8,v20,4 ; insert value onto history

	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4 ; i<data_len
	bc 12,0,L1200

L1400:
	mtspr 256,r0 ; restore old vrsave
	lmw r31,-4(r1)
	blr

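; to illustrate the kernel above: a hedged C/AltiVec-intrinsics sketch of one
; output sample for order<=4 (the helper name, packing, and store-to-memory
; extraction are assumptions for illustration; the real code keeps everything
; in registers and handles arbitrary alignment with the permutes above):
;
;	#include <altivec.h>
;	#include <stdint.h>
;
;	/* coeff/hist carry the four 16-bit values in the low (odd) halfword
;	   of each 32-bit lane, paired lane-for-lane with each other */
;	static int32_t lpc_step4(vector signed short coeff,
;	                         vector signed short hist,
;	                         vector unsigned int quant, int32_t res)
;	{
;		int32_t out[4] __attribute__((aligned(16)));
;		vector signed int v = vec_mulo(coeff, hist); /* vmulosh: 4 products  */
;		v = vec_sums(v, vec_splat_s32(0));           /* vsumsws: sum, lane 3 */
;		v = vec_sra(v, quant);                       /* vsraw: >> lp_quant.  */
;		vec_st(v, 0, (vector signed int *)out);
;		return res + out[3];        /* the asm uses saturating vaddsws here */
;	}
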
_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
;	r3: residual[]
;	r4: data_len
;	r5: qlp_coeff[]
;	r6: order
;	r7: lp_quantization
;	r8: data[]

; see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
; this version assumes order<=8; it uses fewer vector registers, which should
; save time in context switches, and has less code, which may improve
; instruction caching
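;
; (in libFLAC the decoder chooses between these two entry points at runtime
; according to the predictor order; see the function-pointer setup in
; src/libFLAC/stream_decoder.c)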

	stmw r31,-4(r1)

	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31 ; for quadword-aligned stack data

	slwi r6,r6,2 ; adjust for word size
	slwi r4,r4,2
	add r4,r4,r8 ; r4 = data+data_len

	mfspr r0,256 ; cache old vrsave
	addis r31,0,hi16(0xffc00000)
	ori r31,r31,lo16(0xffc00000)
	mtspr 256,r31 ; declare VRs in vrsave

	cmplw cr0,r8,r4 ; i<data_len
	bc 4,0,L2400

	; load coefficients into v0-v1 and initial history into v2-v3
	li r31,0xf
	and r31,r8,r31 ; r31: data%16 (in bytes)
	li r11,16
	subf r31,r31,r11 ; r31: 16-(data%16)
	slwi r31,r31,3 ; convert to bits for vsro
	li r10,-4
	stw r31,-4(r9)
	lvewx v0,r10,r9
	vspltisb v6,-1
	vsro v6,v6,v0 ; v6: mask vector

	li r31,0x8
	lvsl v0,0,r31
	vsldoi v0,v0,v0,12
	li r31,0xc
	lvsl v1,0,r31
	vspltisb v2,0
	vspltisb v3,-1
	vmrglw v2,v2,v3
	vsel v0,v1,v0,v2 ; v0: reversal permutation vector

	add r10,r5,r6
	lvsl v5,0,r5 ; v5: coefficient alignment permutation vector
	vperm v5,v5,v5,v0 ; v5: reversal coefficient alignment permutation vector

	mr r11,r8
	lvsl v4,0,r11 ; v4: history alignment permutation vector

	lvx v0,0,r5
	addi r5,r5,16
	lvx v1,0,r5
	vperm v0,v0,v1,v5
	lvx v2,0,r11
	addi r11,r11,-16
	lvx v3,0,r11
	vperm v2,v3,v2,v4
	cmplw cr0,r5,r10
	bc 12,0,L2101
	vand v0,v0,v6
	addis r31,0,hi16(L2301)
	ori r31,r31,lo16(L2301)
	b L2199

L2101:
	addi r5,r5,16
	lvx v7,0,r5
	vperm v1,v1,v7,v5
	addi r11,r11,-16
	lvx v7,0,r11
	vperm v3,v7,v3,v4
	vand v1,v1,v6
	addis r31,0,hi16(L2300)
	ori r31,r31,lo16(L2300)

L2199:
	mtctr r31

	; set up invariant vectors
	vspltish v4,0 ; v4: zero vector

	li r10,-12
	lvsr v5,r10,r8 ; v5: result shift vector
	lvsl v6,r10,r3 ; v6: residual shift back vector

	li r10,-4
	stw r7,-4(r9)
	lvewx v7,r10,r9 ; v7: lp_quantization vector

L2200:
	vmulosh v8,v0,v2 ; v8: sum vector
	bcctr 20,0

L2300:
	vmulosh v9,v1,v3
	vsldoi v3,v3,v2,4
	vaddsws v8,v8,v9

L2301:
	vsumsws v8,v8,v4 ; v8[3]: sum
	vsraw v8,v8,v7 ; v8[3]: sum >> lp_quantization

	lvewx v9,0,r3 ; v9[n]: *residual
	vperm v9,v9,v9,v6 ; v9[3]: *residual
	vaddsws v8,v9,v8 ; v8[3]: *residual + (sum >> lp_quantization)
	vsldoi v6,v6,v6,4 ; increment shift vector

	vperm v9,v8,v8,v5 ; v9[n]: shift for storage
	vsldoi v5,v5,v5,12 ; increment shift vector
	stvewx v9,0,r8

	vsldoi v8,v8,v8,12
	vsldoi v2,v2,v8,4 ; insert value onto history

	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4 ; i<data_len
	bc 12,0,L2200

L2400:
	mtspr 256,r0 ; restore old vrsave
	lmw r31,-4(r1)
	blr