# libFLAC PowerPC/AltiVec assembly (gas syntax) — lpc_asm.s
      1 #  libFLAC - Free Lossless Audio Codec library
      2 #  Copyright (C) 2004,2005,2006,2007  Josh Coalson
      3 #
      4 #  Redistribution and use in source and binary forms, with or without
      5 #  modification, are permitted provided that the following conditions
      6 #  are met:
      7 #
      8 #  - Redistributions of source code must retain the above copyright
      9 #  notice, this list of conditions and the following disclaimer.
     10 #
     11 #  - Redistributions in binary form must reproduce the above copyright
     12 #  notice, this list of conditions and the following disclaimer in the
     13 #  documentation and/or other materials provided with the distribution.
     14 #
     15 #  - Neither the name of the Xiph.org Foundation nor the names of its
     16 #  contributors may be used to endorse or promote products derived from
     17 #  this software without specific prior written permission.
     18 #
     19 #  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     20 #  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     21 #  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     22 #  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
     23 #  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     24 #  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     25 #  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     26 #  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     27 #  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     28 #  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     29 #  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     30 
     31 .text
     32 	.align 2
     33 .globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16
     34 .type _FLAC__lpc_restore_signal_asm_ppc_altivec_16, @function
     35 
     36 .globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8
     37 .type _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8, @function
     38 
     39 _FLAC__lpc_restore_signal_asm_ppc_altivec_16:
     40 #	r3: residual[]
     41 #	r4: data_len
     42 #	r5: qlp_coeff[]
     43 #	r6: order
     44 #	r7: lp_quantization
     45 #	r8: data[]
     46 
     47 # see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
     48 # these is a PowerPC/Altivec assembly version which requires bps<=16 (or actual
     49 # bps<=15 for mid-side coding, since that uses an extra bit)
     50 
     51 # these should be fast; the inner loop is unrolled (it takes no more than
     52 # 3*(order%4) instructions, all of which are arithmetic), and all of the
     53 # coefficients and all relevant history stay in registers, so the outer loop
     54 # has only one load from memory (the residual)
     55 
     56 # I have not yet run this through simg4, so there may be some avoidable stalls,
     57 # and there may be a somewhat more clever way to do the outer loop
     58 
     59 # the branch mechanism may prevent dynamic loading; I still need to examine
     60 # this issue, and there may be a more elegant method
     61 
     62 	stmw r31,-4(r1)
     63 
     64 	addi r9,r1,-28
     65 	li r31,0xf
     66 	andc r9,r9,r31 # for quadword-aligned stack data
     67 
     68 	slwi r6,r6,2 # adjust for word size
     69 	slwi r4,r4,2
     70 	add r4,r4,r8 # r4 = data+data_len
     71 
     72 	mfspr r0,256 # cache old vrsave
     73 	addis r31,0,0xffff
     74 	ori r31,r31,0xfc00
     75 	mtspr 256,r31 # declare VRs in vrsave
     76 
     77 	cmplw cr0,r8,r4 # i<data_len
     78 	bc 4,0,L1400
     79 
     80 	# load coefficients into v0-v7 and initial history into v8-v15
     81 	li r31,0xf
     82 	and r31,r8,r31 # r31: data%4
     83 	li r11,16
     84 	subf r31,r31,r11 # r31: 4-(data%4)
     85 	slwi r31,r31,3 # convert to bits for vsro
     86 	li r10,-4
     87 	stw r31,-4(r9)
     88 	lvewx v0,r10,r9
     89 	vspltisb v18,-1
     90 	vsro v18,v18,v0 # v18: mask vector
     91 
     92 	li r31,0x8
     93 	lvsl v0,0,r31
     94 	vsldoi v0,v0,v0,12
     95 	li r31,0xc
     96 	lvsl v1,0,r31
     97 	vspltisb v2,0
     98 	vspltisb v3,-1
     99 	vmrglw v2,v2,v3
    100 	vsel v0,v1,v0,v2 # v0: reversal permutation vector
    101 
    102 	add r10,r5,r6
    103 	lvsl v17,0,r5 # v17: coefficient alignment permutation vector
    104 	vperm v17,v17,v17,v0 # v17: reversal coefficient alignment permutation vector
    105 
    106 	mr r11,r8
    107 	lvsl v16,0,r11 # v16: history alignment permutation vector
    108 
    109 	lvx v0,0,r5
    110 	addi r5,r5,16
    111 	lvx v1,0,r5
    112 	vperm v0,v0,v1,v17
    113 	lvx v8,0,r11
    114 	addi r11,r11,-16
    115 	lvx v9,0,r11
    116 	vperm v8,v9,v8,v16
    117 	cmplw cr0,r5,r10
    118 	bc 12,0,L1101
    119 	vand v0,v0,v18
    120 	addis r31,0,L1307@ha
    121 	ori r31,r31,L1307@l
    122 	b L1199
    123 
    124 L1101:
    125 	addi r5,r5,16
    126 	lvx v2,0,r5
    127 	vperm v1,v1,v2,v17
    128 	addi r11,r11,-16
    129 	lvx v10,0,r11
    130 	vperm v9,v10,v9,v16
    131 	cmplw cr0,r5,r10
    132 	bc 12,0,L1102
    133 	vand v1,v1,v18
    134 	addis r31,0,L1306@ha
    135 	ori r31,r31,L1306@l
    136 	b L1199
    137 
    138 L1102:
    139 	addi r5,r5,16
    140 	lvx v3,0,r5
    141 	vperm v2,v2,v3,v17
    142 	addi r11,r11,-16
    143 	lvx v11,0,r11
    144 	vperm v10,v11,v10,v16
    145 	cmplw cr0,r5,r10
    146 	bc 12,0,L1103
    147 	vand v2,v2,v18
    148 	lis r31,L1305@ha
    149 	la r31,L1305@l(r31)
    150 	b L1199
    151 
    152 L1103:
    153 	addi r5,r5,16
    154 	lvx v4,0,r5
    155 	vperm v3,v3,v4,v17
    156 	addi r11,r11,-16
    157 	lvx v12,0,r11
    158 	vperm v11,v12,v11,v16
    159 	cmplw cr0,r5,r10
    160 	bc 12,0,L1104
    161 	vand v3,v3,v18
    162 	lis r31,L1304@ha
    163 	la r31,L1304@l(r31)
    164 	b L1199
    165 
    166 L1104:
    167 	addi r5,r5,16
    168 	lvx v5,0,r5
    169 	vperm v4,v4,v5,v17
    170 	addi r11,r11,-16
    171 	lvx v13,0,r11
    172 	vperm v12,v13,v12,v16
    173 	cmplw cr0,r5,r10
    174 	bc 12,0,L1105
    175 	vand v4,v4,v18
    176 	lis r31,L1303@ha
    177 	la r31,L1303@l(r31)
    178 	b L1199
    179 
    180 L1105:
    181 	addi r5,r5,16
    182 	lvx v6,0,r5
    183 	vperm v5,v5,v6,v17
    184 	addi r11,r11,-16
    185 	lvx v14,0,r11
    186 	vperm v13,v14,v13,v16
    187 	cmplw cr0,r5,r10
    188 	bc 12,0,L1106
    189 	vand v5,v5,v18
    190 	lis r31,L1302@ha
    191 	la r31,L1302@l(r31)
    192 	b L1199
    193 
    194 L1106:
    195 	addi r5,r5,16
    196 	lvx v7,0,r5
    197 	vperm v6,v6,v7,v17
    198 	addi r11,r11,-16
    199 	lvx v15,0,r11
    200 	vperm v14,v15,v14,v16
    201 	cmplw cr0,r5,r10
    202 	bc 12,0,L1107
    203 	vand v6,v6,v18
    204 	lis r31,L1301@ha
    205 	la r31,L1301@l(r31)
    206 	b L1199
    207 
    208 L1107:
    209 	addi r5,r5,16
    210 	lvx v19,0,r5
    211 	vperm v7,v7,v19,v17
    212 	addi r11,r11,-16
    213 	lvx v19,0,r11
    214 	vperm v15,v19,v15,v16
    215 	vand v7,v7,v18
    216 	lis r31,L1300@ha
    217 	la r31,L1300@l(r31)
    218 
    219 L1199:
    220 	mtctr r31
    221 
    222 	# set up invariant vectors
    223 	vspltish v16,0 # v16: zero vector
    224 
    225 	li r10,-12
    226 	lvsr v17,r10,r8 # v17: result shift vector
    227 	lvsl v18,r10,r3 # v18: residual shift back vector
    228 
    229 	li r10,-4
    230 	stw r7,-4(r9)
    231 	lvewx v19,r10,r9 # v19: lp_quantization vector
    232 
    233 L1200:
    234 	vmulosh v20,v0,v8 # v20: sum vector
    235 	bcctr 20,0
    236 
    237 L1300:
    238 	vmulosh v21,v7,v15
    239 	vsldoi v15,v15,v14,4 # increment history
    240 	vaddsws v20,v20,v21
    241 
    242 L1301:
    243 	vmulosh v21,v6,v14
    244 	vsldoi v14,v14,v13,4
    245 	vaddsws v20,v20,v21
    246 
    247 L1302:
    248 	vmulosh v21,v5,v13
    249 	vsldoi v13,v13,v12,4
    250 	vaddsws v20,v20,v21
    251 
    252 L1303:
    253 	vmulosh v21,v4,v12
    254 	vsldoi v12,v12,v11,4
    255 	vaddsws v20,v20,v21
    256 
    257 L1304:
    258 	vmulosh v21,v3,v11
    259 	vsldoi v11,v11,v10,4
    260 	vaddsws v20,v20,v21
    261 
    262 L1305:
    263 	vmulosh v21,v2,v10
    264 	vsldoi v10,v10,v9,4
    265 	vaddsws v20,v20,v21
    266 
    267 L1306:
    268 	vmulosh v21,v1,v9
    269 	vsldoi v9,v9,v8,4
    270 	vaddsws v20,v20,v21
    271 
    272 L1307:
    273 	vsumsws v20,v20,v16 # v20[3]: sum
    274 	vsraw v20,v20,v19 # v20[3]: sum >> lp_quantization
    275 
    276 	lvewx v21,0,r3 # v21[n]: *residual
    277 	vperm v21,v21,v21,v18 # v21[3]: *residual
    278 	vaddsws v20,v21,v20 # v20[3]: *residual + (sum >> lp_quantization)
    279 	vsldoi v18,v18,v18,4 # increment shift vector
    280 
    281 	vperm v21,v20,v20,v17 # v21[n]: shift for storage
    282 	vsldoi v17,v17,v17,12 # increment shift vector
    283 	stvewx v21,0,r8
    284 
    285 	vsldoi v20,v20,v20,12
    286 	vsldoi v8,v8,v20,4 # insert value onto history
    287 
    288 	addi r3,r3,4
    289 	addi r8,r8,4
    290 	cmplw cr0,r8,r4 # i<data_len
    291 	bc 12,0,L1200
    292 
    293 L1400:
    294 	mtspr 256,r0 # restore old vrsave
    295 	lmw r31,-4(r1)
    296 	blr
    297 
_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
#	r3: residual[]
#	r4: data_len
#	r5: qlp_coeff[]
#	r6: order
#	r7: lp_quantization
#	r8: data[]

# see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
# this version assumes order<=8; it uses fewer vector registers, which should
# save time in context switches, and has less code, which may improve
# instruction caching

	stmw r31,-4(r1) # save r31, the only nonvolatile GPR clobbered

	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31 # r9: quadword-aligned scratch slot on the stack

	slwi r6,r6,2 # adjust for word size (order *= 4 bytes)
	slwi r4,r4,2 # data_len *= 4
	add r4,r4,r8 # r4 = data+data_len (end pointer)

	mfspr r0,256 # cache old vrsave
	addis r31,0,0xffc0
	ori r31,r31,0x0000 # 0xffc00000: v0-v9 in use
	mtspr 256,r31 # declare VRs in vrsave

	cmplw cr0,r8,r4 # i<data_len
	bc 4,0,L2400 # nothing to do if data_len == 0

	# load coefficients into v0-v1 and initial history into v2-v3
	li r31,0xf
	and r31,r8,r31 # r31: data%16 (byte offset of data[] within a quadword)
	li r11,16
	subf r31,r31,r11 # r31: 16-(data%16) bytes
	slwi r31,r31,3 # convert to bits for vsro
	li r10,-4
	stw r31,-4(r9)
	lvewx v0,r10,r9
	vspltisb v6,-1
	vsro v6,v6,v0 # v6: mask vector (ones only over valid coeff lanes)

	# build a permute vector that reverses word order within a quadword
	li r31,0x8
	lvsl v0,0,r31
	vsldoi v0,v0,v0,12
	li r31,0xc
	lvsl v1,0,r31
	vspltisb v2,0
	vspltisb v3,-1
	vmrglw v2,v2,v3
	vsel v0,v1,v0,v2 # v0: reversal permutation vector

	add r10,r5,r6 # r10 = qlp_coeff + order*4 (end of coefficients)
	lvsl v5,0,r5 # v5: coefficient alignment permutation vector
	vperm v5,v5,v5,v0 # v5: reversal coefficient alignment permutation vector

	mr r11,r8
	lvsl v4,0,r11 # v4: history alignment permutation vector

	lvx v0,0,r5
	addi r5,r5,16
	lvx v1,0,r5
	vperm v0,v0,v1,v5
	lvx v2,0,r11
	addi r11,r11,-16
	lvx v3,0,r11
	vperm v2,v3,v2,v4
	cmplw cr0,r5,r10 # more than 4 coefficients?
	bc 12,0,L2101
	vand v0,v0,v6 # order<=4: mask off unused coeff lanes
	lis r31,L2301@ha
	la r31,L2301@l(r31)
	b L2199

L2101:
	addi r5,r5,16
	lvx v7,0,r5
	vperm v1,v1,v7,v5
	addi r11,r11,-16
	lvx v7,0,r11
	vperm v3,v7,v3,v4
	vand v1,v1,v6 # 4<order<=8: mask off unused coeff lanes
	lis r31,L2300@ha
	la r31,L2300@l(r31)

L2199:
	mtctr r31 # ctr = loop entry chosen above (L2300 or L2301)

	# set up invariant vectors
	vspltish v4,0 # v4: zero vector

	li r10,-12
	lvsr v5,r10,r8 # v5: result shift vector
	lvsl v6,r10,r3 # v6: residual shift back vector

	li r10,-4
	stw r7,-4(r9)
	lvewx v7,r10,r9 # v7: lp_quantization vector

L2200:
	vmulosh v8,v0,v2 # v8: sum vector
	bcctr 20,0 # branch always to the chosen entry point

L2300:
	vmulosh v9,v1,v3
	vsldoi v3,v3,v2,4 # increment history
	vaddsws v8,v8,v9

L2301:
	vsumsws v8,v8,v4 # v8[3]: sum
	vsraw v8,v8,v7 # v8[3]: sum >> lp_quantization

	lvewx v9,0,r3 # v9[n]: *residual
	vperm v9,v9,v9,v6 # v9[3]: *residual
	vaddsws v8,v9,v8 # v8[3]: *residual + (sum >> lp_quantization)
	vsldoi v6,v6,v6,4 # increment shift vector

	vperm v9,v8,v8,v5 # v9[n]: shift for storage
	vsldoi v5,v5,v5,12 # increment shift vector
	stvewx v9,0,r8 # data[i] = *residual + (sum >> lp_quantization)

	vsldoi v8,v8,v8,12
	vsldoi v2,v2,v8,4 # insert value onto history

	addi r3,r3,4 # residual++
	addi r8,r8,4 # data++
	cmplw cr0,r8,r4 # i<data_len
	bc 12,0,L2200

L2400:
	mtspr 256,r0 # restore old vrsave
	lmw r31,-4(r1) # restore r31
	blr
    432