Home | History | Annotate | Download | only in fec
      1 /* Intel SIMD (SSE) implementation of Viterbi ACS butterflies
      2    for 64-state (k=7) convolutional code
      3    Copyright 2001 Phil Karn, KA9Q
      4    This code may be used under the terms of the GNU Lesser General Public License (LGPL)
      5 
      6    int update_viterbi27_blk_sse(struct v27 *vp,unsigned char syms[],int nbits) ;
      7 */
      8 
      9 	# SSE (64-bit integer SIMD) version
     10 	# Requires Pentium III or better
     11 
     12 	# These are offsets into struct v27, defined in viterbi27.h
     13 	.set DP,128
     14 	.set OLDMETRICS,132
     15 	.set NEWMETRICS,136
     16 .text
     17 .global update_viterbi27_blk_sse,Branchtab27_sse
     18 	.type update_viterbi27_blk_sse,@function
     19 	.align 16
     20 
     21 update_viterbi27_blk_sse:
     22 	pushl %ebp
     23 	movl %esp,%ebp
     24 	pushl %esi
     25 	pushl %edi
     26 	pushl %edx
     27 	pushl %ebx
     28 
     29 	movl 8(%ebp),%edx	# edx = vp
     30 	testl %edx,%edx
     31 	jnz  0f
     32 	movl -1,%eax
     33 	jmp  err
     34 0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
     35 	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
     36 	movl DP(%edx),%edx	# edx -> decisions
     37 
     38 1:	movl 16(%ebp),%eax	# eax = nbits
     39 	decl %eax
     40 	jl   2f			# passed zero, we're done
     41 	movl %eax,16(%ebp)
     42 
     43 	xorl %eax,%eax
     44 	movl 12(%ebp),%ebx	# %ebx = syms
     45 	movb (%ebx),%al
     46 	movd %eax,%mm6		# mm6[0] = first symbol
     47 	movb 1(%ebx),%al
     48 	movd %eax,%mm5		# mm5[0] = second symbol
     49 	addl $2,%ebx
     50 	movl %ebx,12(%ebp)
     51 
     52 	punpcklbw %mm6,%mm6	# mm6[1] = mm6[0]
     53 	punpcklbw %mm5,%mm5
     54 	movq thirtyones,%mm7
     55 
     56 	pshufw $0,%mm6,%mm6	# copy low word to upper 3
     57 	pshufw $0,%mm5,%mm5
     58 	# mm6 now contains first symbol in each byte, mm5 the second
     59 
     60 	# each invocation of this macro does 8 butterflies in parallel
     61 	.MACRO butterfly GROUP
     62 	# compute branch metrics
     63 	movq Branchtab27_sse+(8*\GROUP),%mm4
     64 	movq Branchtab27_sse+32+(8*\GROUP),%mm3
     65 	pxor %mm6,%mm4
     66 	pxor %mm5,%mm3
     67 	pavgb %mm3,%mm4			# mm4 contains branch metrics
     68 	psrlw $3,%mm4
     69 	pand %mm7,%mm4
     70 
     71 	movq (8*\GROUP)(%esi),%mm0	# Incoming path metric, high bit = 0
     72 	movq ((8*\GROUP)+32)(%esi),%mm3	# Incoming path metric, high bit = 1
     73 	movq %mm0,%mm2
     74 	movq %mm3,%mm1
     75 	paddusb %mm4,%mm0
     76 	paddusb %mm4,%mm3
     77 
     78 	# invert branch metrics. This works only because they're 5 bits
     79 	pxor %mm7,%mm4
     80 
     81 	paddusb %mm4,%mm1
     82 	paddusb %mm4,%mm2
     83 
     84 	# Find survivors, leave in mm0,2
     85 	pminub %mm1,%mm0
     86 	pminub %mm3,%mm2
     87 	# get decisions, leave in mm1,3
     88 	pcmpeqb %mm0,%mm1
     89 	pcmpeqb %mm2,%mm3
     90 
     91 	# interleave and store new branch metrics in mm0,2
     92 	movq %mm0,%mm4
     93 	punpckhbw %mm2,%mm0	# interleave second 8 new metrics
     94 	punpcklbw %mm2,%mm4	# interleave first 8 new metrics
     95 	movq %mm0,(16*\GROUP+8)(%edi)
     96 	movq %mm4,(16*\GROUP)(%edi)
     97 
     98 	# interleave decisions, accumulate into %ebx
     99 	movq %mm1,%mm4
    100 	punpckhbw %mm3,%mm1
    101 	punpcklbw %mm3,%mm4
    102 	# Due to an error in the Intel instruction set ref (the register
    103 	# fields are swapped), gas assembles pmovmskb incorrectly
    104 	# See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html
    105 	.byte 0x0f,0xd7,0xc1	# pmovmskb %mm1,%eax
    106 	shll $((16*\GROUP+8)&31),%eax
    107 	orl %eax,%ebx
    108 	.byte 0x0f,0xd7,0xc4	# pmovmskb %mm4,%eax
    109 	shll $((16*\GROUP)&31),%eax
    110 	orl %eax,%ebx
    111 	.endm
    112 
    113 	# invoke macro 4 times for a total of 32 butterflies
    114 	xorl %ebx,%ebx		# clear decisions
    115 	butterfly GROUP=0
    116 	butterfly GROUP=1
    117 	movl %ebx,(%edx)	# stash first 32 decisions
    118 	xorl %ebx,%ebx
    119 	butterfly GROUP=2
    120 	butterfly GROUP=3
    121 	movl %ebx,4(%edx)	# stash second 32 decisions
    122 
    123 	addl $8,%edx		# bump decision pointer
    124 
    125 	# see if we have to normalize
    126 	movl (%edi),%eax	# extract first output metric
    127 	andl $255,%eax
    128 	cmpl $150,%eax		# is it greater than 150?
    129 	movl $0,%eax
    130 	jle done		# No, no need to normalize
    131 
    132 	# Normalize by finding smallest metric and subtracting it
    133 	# from all metrics
    134 	movq (%edi),%mm0
    135 	pminub 8(%edi),%mm0
    136 	pminub 16(%edi),%mm0
    137 	pminub 24(%edi),%mm0
    138 	pminub 32(%edi),%mm0
    139 	pminub 40(%edi),%mm0
    140 	pminub 48(%edi),%mm0
    141 	pminub 56(%edi),%mm0
    142 	# mm0 contains 8 smallest metrics
    143 	# crunch down to single lowest metric
    144 	movq %mm0,%mm1
    145 	psrlq $32,%mm0
    146 	pminub %mm1,%mm0
    147 	movq %mm0,%mm1
    148 	psrlq $16,%mm0
    149 	pminub %mm1,%mm0
    150 	movq %mm0,%mm1
    151 	psrlq $8,%mm0
    152 	pminub %mm1,%mm0
    153 	punpcklbw %mm0,%mm0	# expand to all 8 bytes
    154 	pshufw $0,%mm0,%mm0
    155 
    156 	# mm0 now contains lowest metric in all 8 bytes
    157 	# subtract it from every output metric
    158 	# Trashes %mm7
    159 	.macro PSUBUSBM REG,MEM
    160 	movq \MEM,%mm7
    161 	psubusb \REG,%mm7
    162 	movq %mm7,\MEM
    163 	.endm
    164 
    165 	PSUBUSBM %mm0,(%edi)
    166 	PSUBUSBM %mm0,8(%edi)
    167 	PSUBUSBM %mm0,16(%edi)
    168 	PSUBUSBM %mm0,24(%edi)
    169 	PSUBUSBM %mm0,32(%edi)
    170 	PSUBUSBM %mm0,40(%edi)
    171 	PSUBUSBM %mm0,48(%edi)
    172 	PSUBUSBM %mm0,56(%edi)
    173 
    174 	movd %mm0,%eax
    175 	and $0xff,%eax
    176 
    177 done:	# swap metrics
    178 	movl %esi,%eax
    179 	movl %edi,%esi
    180 	movl %eax,%edi
    181 	jmp 1b
    182 
    183 2:	emms
    184 	movl 8(%ebp),%ebx	# ebx = vp
    185 	# stash metric pointers
    186 	movl %esi,OLDMETRICS(%ebx)
    187 	movl %edi,NEWMETRICS(%ebx)
    188 	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
    189 	xorl %eax,%eax
    190 err:	popl %ebx
    191 	popl %edx
    192 	popl %edi
    193 	popl %esi
    194 	popl %ebp
    195 
    196 	ret
    197 
    198 	.data
    199 
    200 	.align 16
    201 thirtyones:
    202 	.byte 31,31,31,31,31,31,31,31
    203 
    204 
    205 
    206