/* Intel SIMD (SSE2) implementations of Viterbi ACS butterflies
   for 64-state (k=7) convolutional code
   Copyright 2003 Phil Karn, KA9Q
   This code may be used under the terms of the GNU Lesser General Public License (LGPL)

   void update_viterbi27_blk_sse2(struct v27 *vp,unsigned char syms[],int nbits);
*/
	# SSE2 (128-bit integer SIMD) version
	# Requires Pentium 4 or better
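	# Stack arguments: 8(%ebp) = vp, 12(%ebp) = syms, 16(%ebp) = nbits.
	# Each trellis step consumes two soft-decision symbol bytes from syms[]
	# and writes 64 traceback decision bits (8 bytes) to the decision array at vp->dp.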

	# These are offsets into struct v27, defined in viterbi27.h
	.set DP,128
	.set OLDMETRICS,132
	.set NEWMETRICS,136
	.text
	.global update_viterbi27_blk_sse2,Branchtab27_sse2
	.type update_viterbi27_blk_sse2,@function
	.align 16

update_viterbi27_blk_sse2:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %edi
	pushl %edx
	pushl %ebx

	movl 8(%ebp),%edx	# edx = vp
	testl %edx,%edx
	jnz  0f
	movl $-1,%eax		# vp is NULL, return -1
	jmp  err
0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
	movl DP(%edx),%edx	# edx -> decisions

1:	movl 16(%ebp),%eax	# eax = nbits
	decl %eax
	jl   2f			# passed zero, we're done
	movl %eax,16(%ebp)

	xorl %eax,%eax
	movl 12(%ebp),%ebx	# ebx = syms
	movb (%ebx),%al
	movd %eax,%xmm6		# xmm6[0] = first symbol
	movb 1(%ebx),%al
	movd %eax,%xmm5		# xmm5[0] = second symbol
	addl $2,%ebx
	movl %ebx,12(%ebp)

	punpcklbw %xmm6,%xmm6	# xmm6[1] = xmm6[0]
	punpcklbw %xmm5,%xmm5
	pshuflw $0,%xmm6,%xmm6	# replicate low word across the low 4 words
	pshuflw $0,%xmm5,%xmm5
	punpcklqdq %xmm6,%xmm6  # propagate to all 16 bytes
	punpcklqdq %xmm5,%xmm5
	# xmm6 now contains the first symbol in each byte, xmm5 the second
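	# Broadcasting each symbol to all 16 byte lanes lets a single pxor against the
	# Branchtab27_sse2 entries compute 16 per-state symbol metrics at once.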

	movdqa thirtyones,%xmm7

	# each invocation of this macro does 16 butterflies in parallel
	.MACRO butterfly GROUP
	# compute branch metrics
	movdqa Branchtab27_sse2+(16*\GROUP),%xmm4
	movdqa Branchtab27_sse2+32+(16*\GROUP),%xmm3
	pxor %xmm6,%xmm4
	pxor %xmm5,%xmm3

	# compute the 5-bit branch metric in xmm4 by averaging the two individual
	# symbol metrics and scaling the result down to 0-31. This is okay for this
	# code because the worst-case metric spread (at high Eb/No) is only 120,
	# well within the range of our unsigned 8-bit path metrics, and even within
	# the range of signed 8-bit path metrics
	pavgb %xmm3,%xmm4
	psrlw $3,%xmm4

	pand %xmm7,%xmm4
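	# Arithmetic: pavgb yields (sym0_metric + sym1_metric + 1) >> 1; psrlw $3 plus the
	# pand with thirtyones keeps bits 3-7 of each byte, so the combined branch metric
	# is roughly (sym0_metric + sym1_metric)/16, a value in the range 0-31.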

	movdqa (16*\GROUP)(%esi),%xmm0	# Incoming path metric, high bit = 0
	movdqa ((16*\GROUP)+32)(%esi),%xmm3	# Incoming path metric, high bit = 1
	movdqa %xmm0,%xmm2
	movdqa %xmm3,%xmm1
	paddusb %xmm4,%xmm0	# note use of saturating arithmetic
	paddusb %xmm4,%xmm3	# saturation shouldn't be needed after normalization, but it's cheap insurance

	# complement the branch metrics: xmm4 ^ 31 = 31 - metric, the metric of the other branch
	pxor %xmm7,%xmm4
	paddusb %xmm4,%xmm1
	paddusb %xmm4,%xmm2
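	# Each of the 16 butterflies now has its four candidate sums:
	#   xmm0 = old[i]    + metric        xmm3 = old[i+32] + metric
	#   xmm2 = old[i]    + (31-metric)   xmm1 = old[i+32] + (31-metric)
	# The compare/select below keeps the smaller sum of each competing pair.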

	# Find survivors, leave them in xmm0,2
	pminub %xmm1,%xmm0
	pminub %xmm3,%xmm2
	# get decisions, leave in xmm1,3
	pcmpeqb %xmm0,%xmm1
	pcmpeqb %xmm2,%xmm3
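	# Decision bytes are 0xFF where the candidate from the high-half predecessor
	# (old metric at offset +32) matched the surviving minimum, 0x00 otherwise.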

	# interleave and store the new path metrics (survivors in xmm0,2)
	movdqa %xmm0,%xmm4
	punpckhbw %xmm2,%xmm0	# interleave second 16 new metrics
	punpcklbw %xmm2,%xmm4	# interleave first 16 new metrics
	movdqa %xmm0,(32*\GROUP+16)(%edi)
	movdqa %xmm4,(32*\GROUP)(%edi)

	# interleave decisions & store
	movdqa %xmm1,%xmm4
	punpckhbw %xmm3,%xmm1
	punpcklbw %xmm3,%xmm4
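	# pmovmskb packs the high (decision) bit of each of the 16 bytes into a 16-bit
	# mask; the two masks are merged into a single 32-bit word holding one traceback
	# decision bit per new state in this group.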
	# work around bug in gas due to Intel doc error
	.byte 0x66,0x0f,0xd7,0xd9	# pmovmskb %xmm1,%ebx
	shll $16,%ebx
	.byte 0x66,0x0f,0xd7,0xc4	# pmovmskb %xmm4,%eax
	orl %eax,%ebx
	movl %ebx,(4*\GROUP)(%edx)
	.endm

	# invoke macro 2 times for a total of 32 butterflies
	butterfly GROUP=0
	butterfly GROUP=1

	addl $8,%edx		# bump decision pointer
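	# 2 groups x 32 decision bits = 64 bits (8 bytes) stored per trellis step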

	# See if we have to normalize. This requires an explanation. We don't want
	# our path metrics to exceed 255 on the *next* iteration. Since the
	# largest branch metric is 30, that means we don't want any to exceed 225
	# on *this* iteration. Rather than looking at them all, we just pick an arbitrary one
	# (the first) and see if it exceeds 225-120=105, where 120 is the experimentally-
	# determined worst-case metric spread for this code with branch metrics in the range 0-30.

	# This is extremely conservative, and empirical testing at a variety of Eb/Nos might
	# show that a higher threshold could be used without affecting BER performance
	movl (%edi),%eax	# extract first output metric
	andl $255,%eax
	cmp $105,%eax
	jle done		# No, no need to normalize

	# Normalize by finding smallest metric and subtracting it
	# from all metrics. We can't just pick an arbitrary small constant because
	# the minimum metric might be zero!
	movdqa (%edi),%xmm0
	movdqa %xmm0,%xmm4
	movdqa 16(%edi),%xmm1
	pminub %xmm1,%xmm4
	movdqa 32(%edi),%xmm2
	pminub %xmm2,%xmm4
	movdqa 48(%edi),%xmm3
	pminub %xmm3,%xmm4

	# crunch down to single lowest metric
	movdqa %xmm4,%xmm5
	psrldq $8,%xmm5     # the count to psrldq is bytes, not bits!
	pminub %xmm5,%xmm4
	movdqa %xmm4,%xmm5
	psrlq $32,%xmm5
	pminub %xmm5,%xmm4
	movdqa %xmm4,%xmm5
	psrlq $16,%xmm5
	pminub %xmm5,%xmm4
	movdqa %xmm4,%xmm5
	psrlq $8,%xmm5
	pminub %xmm5,%xmm4	# now in lowest byte of %xmm4

	punpcklbw %xmm4,%xmm4	# lowest 2 bytes
	pshuflw $0,%xmm4,%xmm4  # lowest 8 bytes
	punpcklqdq %xmm4,%xmm4	# all 16 bytes

	# xmm4 now contains lowest metric in all 16 bytes
	# subtract it from every output metric
	psubusb %xmm4,%xmm0
	psubusb %xmm4,%xmm1
	psubusb %xmm4,%xmm2
	psubusb %xmm4,%xmm3
	movdqa %xmm0,(%edi)
	movdqa %xmm1,16(%edi)
	movdqa %xmm2,32(%edi)
	movdqa %xmm3,48(%edi)
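	# psubusb saturates at zero, but no underflow can occur here: xmm4 holds the
	# true minimum of all 64 metrics, so every difference is non-negative.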

done:
	# swap metrics: ping-pong the old/new buffers so nothing needs to be copied
	movl %esi,%eax
	movl %edi,%esi
	movl %eax,%edi
	jmp 1b

2:	movl 8(%ebp),%ebx	# ebx = vp
	# stash metric pointers
	movl %esi,OLDMETRICS(%ebx)
	movl %edi,NEWMETRICS(%ebx)
	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
	xorl %eax,%eax		# success: return 0
err:	popl %ebx
	popl %edx
	popl %edi
	popl %esi
	popl %ebp
	ret

	.data
	.align 16

thirtyones:
	.byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31
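	# thirtyones is used above both as the 5-bit branch-metric mask (pand) and as
	# the XOR constant that forms the complementary metric 31 - metric (pxor).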