Home | History | Annotate | Download | only in fec
      1 /* Intel SIMD SSE implementation of Viterbi ACS butterflies
      2    for 256-state (k=9) convolutional code
      3    Copyright 2004 Phil Karn, KA9Q
      4    This code may be used under the terms of the GNU Lesser General Public License (LGPL)
      5 
      6    void update_viterbi29_blk_sse(struct v29 *vp,unsigned char syms[],int nbits);
      7 */
      8 	# SSE (64-bit integer SIMD) version
      9 	# Requires Pentium III or better
     10 	# These are offsets into struct v29, defined in viterbi29.h
     11 	.set DP,512
     12 	.set OLDMETRICS,516
     13 	.set NEWMETRICS,520
     14 	.text
     15 	.global update_viterbi29_blk_sse,Branchtab29_sse
     16 	.type update_viterbi29_blk_sse,@function
     17 	.align 16
     18 
     19 update_viterbi29_blk_sse:
     20 	pushl %ebp
     21 	movl %esp,%ebp
     22 	pushl %esi
     23 	pushl %edi
     24 	pushl %edx
     25 	pushl %ebx
     26 
     27 	movl 8(%ebp),%edx	# edx = vp
     28 	testl %edx,%edx
     29 	jnz  0f
     30 	movl -1,%eax
     31 	jmp  err
     32 0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
     33 	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
     34 	movl DP(%edx),%edx	# edx -> decisions
     35 
     36 1:	movl 16(%ebp),%eax	# eax = nbits
     37 	decl %eax
     38 	jl   2f			# passed zero, we're done
     39 	movl %eax,16(%ebp)
     40 
     41 	xorl %eax,%eax
     42 	movl 12(%ebp),%ebx	# ebx = syms
     43 	movb (%ebx),%al
     44 	movd %eax,%mm6		# mm6[0] = first symbol
     45 	movb 1(%ebx),%al
     46 	movd %eax,%mm5		# mm5[0] = second symbol
     47 	addl $2,%ebx
     48 	movl %ebx,12(%ebp)
     49 
     50 	punpcklbw %mm6,%mm6	# mm6[1] = mm6[0]
     51 	punpcklbw %mm5,%mm5
     52 
     53 	movq thirtyones,%mm7
     54 	pshufw $0,%mm6,%mm6	# copy low word to upper 3
     55 	pshufw $0,%mm5,%mm5
     56 	# mm6 now contains first symbol in each byte, mm5 the second
     57 
     58 	# each invocation of this macro does 8 butterflies in parallel
     59 	.MACRO butterfly GROUP
     60 	# compute branch metrics
     61 	movq Branchtab29_sse+(8*\GROUP),%mm4
     62 	movq Branchtab29_sse+128+(8*\GROUP),%mm3
     63 	pxor %mm6,%mm4
     64 	pxor %mm5,%mm3
     65 	pavgb %mm3,%mm4			# mm4 contains branch metrics
     66 	psrlw $3,%mm4
     67 	pand %mm7,%mm4
     68 
     69 	movq (8*\GROUP)(%esi),%mm0	# Incoming path metric, high bit = 0
     70 	movq ((8*\GROUP)+128)(%esi),%mm3	# Incoming path metric, high bit = 1
     71 	movq %mm0,%mm2
     72 	movq %mm3,%mm1
     73 	paddusb %mm4,%mm0
     74 	paddusb %mm4,%mm3
     75 
     76 	# invert branch metrics. This works only because they're 5 bits
     77 	pxor %mm7,%mm4
     78 
     79 	paddusb %mm4,%mm1
     80 	paddusb %mm4,%mm2
     81 
     82 	# Find survivors, leave in mm0,2
     83 	pminub %mm1,%mm0
     84 	pminub %mm3,%mm2
     85 	# get decisions, leave in mm1,3
     86 	pcmpeqb %mm0,%mm1
     87 	pcmpeqb %mm2,%mm3
     88 
     89 	# interleave and store new branch metrics in mm0,2
     90 	movq %mm0,%mm4
     91 	punpckhbw %mm2,%mm0	# interleave second 8 new metrics
     92 	punpcklbw %mm2,%mm4	# interleave first 8 new metrics
     93 	movq %mm0,(16*\GROUP+8)(%edi)
     94 	movq %mm4,(16*\GROUP)(%edi)
     95 
     96 	# interleave decisions, accumulate into %ebx
     97 	movq %mm1,%mm4
     98 	punpckhbw %mm3,%mm1
     99 	punpcklbw %mm3,%mm4
    100 	# Due to an error in the Intel instruction set ref (the register
    101 	# fields are swapped), gas assembles pmovmskb incorrectly
    102 	# See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html
    103 	.byte 0x0f,0xd7,0xc1	# pmovmskb %mm1,%eax
    104 	shll $((16*\GROUP+8)&31),%eax
    105 	orl %eax,%ebx
    106 	.byte 0x0f,0xd7,0xc4	# pmovmskb %mm4,%eax
    107 	shll $((16*\GROUP)&31),%eax
    108 	orl %eax,%ebx
    109 	.endm
    110 
    111 	# invoke macro 16 times for a total of 128 butterflies
    112 	xorl %ebx,%ebx		# clear decisions
    113 	butterfly GROUP=0
    114 	butterfly GROUP=1
    115 	movl %ebx,(%edx)	# stash first 32 decisions
    116 	xorl %ebx,%ebx
    117 	butterfly GROUP=2
    118 	butterfly GROUP=3
    119 	movl %ebx,4(%edx)	# stash second 32 decisions
    120 	xorl %ebx,%ebx		# clear decisions
    121 	butterfly GROUP=4
    122 	butterfly GROUP=5
    123 	movl %ebx,8(%edx)	# stash first 32 decisions
    124 	xorl %ebx,%ebx
    125 	butterfly GROUP=6
    126 	butterfly GROUP=7
    127 	movl %ebx,12(%edx)	# stash second 32 decisions
    128 	xorl %ebx,%ebx		# clear decisions
    129 	butterfly GROUP=8
    130 	butterfly GROUP=9
    131 	movl %ebx,16(%edx)	# stash first 32 decisions
    132 	xorl %ebx,%ebx
    133 	butterfly GROUP=10
    134 	butterfly GROUP=11
    135 	movl %ebx,20(%edx)	# stash second 32 decisions
    136 	xorl %ebx,%ebx		# clear decisions
    137 	butterfly GROUP=12
    138 	butterfly GROUP=13
    139 	movl %ebx,24(%edx)	# stash first 32 decisions
    140 	xorl %ebx,%ebx
    141 	butterfly GROUP=14
    142 	butterfly GROUP=15
    143 	movl %ebx,28(%edx)	# stash second 32 decisions
    144 
    145 	addl $32,%edx		# bump decision pointer
    146 
    147 	# see if we have to normalize
    148 	movl (%edi),%eax	# extract first output metric
    149 	andl $255,%eax
    150 	cmp $50,%eax		# is it greater than 50?
    151 	movl $0,%eax
    152 	jle done		# No, no need to normalize
    153 
    154 	# Normalize by finding smallest metric and subtracting it
    155 	# from all metrics
    156 	movq (%edi),%mm0
    157 	pminub 8(%edi),%mm0
    158 	pminub 16(%edi),%mm0
    159 	pminub 24(%edi),%mm0
    160 	pminub 32(%edi),%mm0
    161 	pminub 40(%edi),%mm0
    162 	pminub 48(%edi),%mm0
    163 	pminub 56(%edi),%mm0
    164 	pminub 64(%edi),%mm0
    165 	pminub 72(%edi),%mm0
    166 	pminub 80(%edi),%mm0
    167 	pminub 88(%edi),%mm0
    168 	pminub 96(%edi),%mm0
    169 	pminub 104(%edi),%mm0
    170 	pminub 112(%edi),%mm0
    171 	pminub 120(%edi),%mm0
    172 	pminub 128(%edi),%mm0
    173 	pminub 136(%edi),%mm0
    174 	pminub 144(%edi),%mm0
    175 	pminub 152(%edi),%mm0
    176 	pminub 160(%edi),%mm0
    177 	pminub 168(%edi),%mm0
    178 	pminub 176(%edi),%mm0
    179 	pminub 184(%edi),%mm0
    180 	pminub 192(%edi),%mm0
    181 	pminub 200(%edi),%mm0
    182 	pminub 208(%edi),%mm0
    183 	pminub 216(%edi),%mm0
    184 	pminub 224(%edi),%mm0
    185 	pminub 232(%edi),%mm0
    186 	pminub 240(%edi),%mm0
    187 	pminub 248(%edi),%mm0
    188 	# mm0 contains 8 smallest metrics
    189 	# crunch down to single lowest metric
    190 	movq %mm0,%mm1
    191 	psrlq $32,%mm0
    192 	pminub %mm1,%mm0
    193 	movq %mm0,%mm1
    194 	psrlq $16,%mm0
    195 	pminub %mm1,%mm0
    196 	movq %mm0,%mm1
    197 	psrlq $8,%mm0
    198 	pminub %mm1,%mm0
    199 	movq 8(%edi),%mm1	# reload
    200 	punpcklbw %mm0,%mm0	# expand to all 8 bytes
    201 	pshufw $0,%mm0,%mm0
    202 
    203 	# mm0 now contains lowest metric in all 8 bytes
    204 	# subtract it from every output metric
    205 	# Trashes %mm7
    206 	.macro PSUBUSBM REG,MEM
    207 	movq \MEM,%mm7
    208 	psubusb \REG,%mm7
    209 	movq %mm7,\MEM
    210 	.endm
    211 
    212 	PSUBUSBM %mm0,(%edi)
    213 	PSUBUSBM %mm0,8(%edi)
    214 	PSUBUSBM %mm0,16(%edi)
    215 	PSUBUSBM %mm0,24(%edi)
    216 	PSUBUSBM %mm0,32(%edi)
    217 	PSUBUSBM %mm0,40(%edi)
    218 	PSUBUSBM %mm0,48(%edi)
    219 	PSUBUSBM %mm0,56(%edi)
    220 	PSUBUSBM %mm0,64(%edi)
    221 	PSUBUSBM %mm0,72(%edi)
    222 	PSUBUSBM %mm0,80(%edi)
    223 	PSUBUSBM %mm0,88(%edi)
    224 	PSUBUSBM %mm0,96(%edi)
    225 	PSUBUSBM %mm0,104(%edi)
    226 	PSUBUSBM %mm0,112(%edi)
    227 	PSUBUSBM %mm0,120(%edi)
    228 	PSUBUSBM %mm0,128(%edi)
    229 	PSUBUSBM %mm0,136(%edi)
    230 	PSUBUSBM %mm0,144(%edi)
    231 	PSUBUSBM %mm0,152(%edi)
    232 	PSUBUSBM %mm0,160(%edi)
    233 	PSUBUSBM %mm0,168(%edi)
    234 	PSUBUSBM %mm0,176(%edi)
    235 	PSUBUSBM %mm0,184(%edi)
    236 	PSUBUSBM %mm0,192(%edi)
    237 	PSUBUSBM %mm0,200(%edi)
    238 	PSUBUSBM %mm0,208(%edi)
    239 	PSUBUSBM %mm0,216(%edi)
    240 	PSUBUSBM %mm0,224(%edi)
    241 	PSUBUSBM %mm0,232(%edi)
    242 	PSUBUSBM %mm0,240(%edi)
    243 	PSUBUSBM %mm0,248(%edi)
    244 
    245 done:
    246 	# swap metrics
    247 	movl %esi,%eax
    248 	movl %edi,%esi
    249 	movl %eax,%edi
    250 	jmp 1b
    251 
    252 2:	emms
    253 	movl 8(%ebp),%ebx	# ebx = vp
    254 	# stash metric pointers
    255 	movl %esi,OLDMETRICS(%ebx)
    256 	movl %edi,NEWMETRICS(%ebx)
    257 	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
    258 	xorl %eax,%eax
    259 err:	popl %ebx
    260 	popl %edx
    261 	popl %edi
    262 	popl %esi
    263 	popl %ebp
    264 	ret
    265 
    266 	.data
    267 	.align 8
    268 thirtyones:
    269 	.byte 31,31,31,31,31,31,31,31
    270 
    271 
    272