/* Intel SIMD SSE2 implementation of Viterbi ACS butterflies
   for 256-state (k=9) convolutional code
   Copyright 2004 Phil Karn, KA9Q
   This code may be used under the terms of the GNU Lesser General Public License (LGPL)

   void update_viterbi29_blk_sse2(struct v29 *vp,unsigned char *syms,int nbits) ;
*/

# SSE2 (128-bit integer SIMD) version
# Requires Pentium 4 or better
#
# Syntax: GNU as, AT&T operand order (src,dst), 32-bit x86.
# Arguments are read from the stack relative to %ebp:
#	 8(%ebp) = vp, 12(%ebp) = syms, 16(%ebp) = nbits
# The stack copies of syms and nbits are updated in place as the loop runs.
#
# For each of nbits iterations the routine reads two symbol bytes from syms,
# performs 128 add-compare-select butterflies (8 groups of 16), writes 256
# new one-byte metrics and 32 bytes of decision bits, and then swaps the
# old/new metric pointers.  On exit the metric pointers and the advanced
# decision pointer are stored back into *vp.
#
# %eax on return: 0 after normal completion, -1 if vp is NULL.
#
# Register roles inside the loop:
#	%esi  -> old (incoming) metrics, accessed with movdqa (16-byte aligned)
#	%edi  -> new (outgoing) metrics, accessed with movdqa (16-byte aligned)
#	%edx  -> next decision word to write
#	%eax, %ebx  scratch
#	%xmm5 second symbol replicated into all 16 bytes
#	%xmm6 first symbol replicated into all 16 bytes
#	%xmm7 constant 31 in all 16 bytes (loaded once, never written in the loop)
#	%xmm0-%xmm4 scratch

# These are offsets into struct v29, defined in viterbi29.h
	.set DP,512
	.set OLDMETRICS,516
	.set NEWMETRICS,520

	.text
	.global update_viterbi29_blk_sse2,Branchtab29_sse2

# Each invocation of this macro does 16 butterflies in parallel.
# In:      %esi, %edi, %edx, %xmm5, %xmm6, %xmm7 as described above
# Clobbers: %eax, %ebx, %xmm0-%xmm4, flags
	.macro butterfly GROUP
	# compute branch metrics: average of the two (table XOR symbol) terms,
	# shifted right 3 and masked to 5 bits.  psrlw shifts 16-bit lanes, so
	# the pand also removes bits shifted in from the neighbouring byte.
	movdqa Branchtab29_sse2+(16*\GROUP),%xmm4
	movdqa Branchtab29_sse2+128+(16*\GROUP),%xmm3
	pxor %xmm6,%xmm4
	pxor %xmm5,%xmm3
	pavgb %xmm3,%xmm4
	psrlw $3,%xmm4

	pand %xmm7,%xmm4			# xmm4 contains branch metrics

	movdqa (16*\GROUP)(%esi),%xmm0		# Incoming path metric, high bit = 0
	movdqa ((16*\GROUP)+128)(%esi),%xmm3	# Incoming path metric, high bit = 1
	movdqa %xmm0,%xmm2
	movdqa %xmm3,%xmm1
	paddusb %xmm4,%xmm0
	paddusb %xmm4,%xmm3

	# invert branch metrics (31 - metric, since metric is 5 bits)
	pxor %xmm7,%xmm4

	paddusb %xmm4,%xmm1
	paddusb %xmm4,%xmm2

	# Find survivors, leave in xmm0,2
	pminub %xmm1,%xmm0
	pminub %xmm3,%xmm2
	# get decisions, leave in xmm1,3
	# (0xff where the candidate from the "high bit = 1" metric is the survivor)
	pcmpeqb %xmm0,%xmm1
	pcmpeqb %xmm2,%xmm3

	# interleave and store new path metrics from xmm0,2
	movdqa %xmm0,%xmm4
	punpckhbw %xmm2,%xmm0			# interleave second 16 new metrics
	punpcklbw %xmm2,%xmm4			# interleave first 16 new metrics
	movdqa %xmm0,(32*\GROUP+16)(%edi)
	movdqa %xmm4,(32*\GROUP)(%edi)

	# interleave decisions & store as one 32-bit word
	movdqa %xmm1,%xmm4
	punpckhbw %xmm3,%xmm1
	punpcklbw %xmm3,%xmm4
	# work around bug in gas due to Intel doc error
	.byte 0x66,0x0f,0xd7,0xd9		# pmovmskb %xmm1,%ebx
	shll $16,%ebx
	.byte 0x66,0x0f,0xd7,0xc4		# pmovmskb %xmm4,%eax
	orl %eax,%ebx
	movl %ebx,(4*\GROUP)(%edx)
	.endm

	.type update_viterbi29_blk_sse2,@function
	.align 16

update_viterbi29_blk_sse2:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %edi
	pushl %edx
	pushl %ebx

	movl 8(%ebp),%edx			# edx = vp
	testl %edx,%edx
	jnz 0f
	# Immediate -1: without the '$' this would be a load from absolute
	# address 0xffffffff and fault instead of returning an error.
	movl $-1,%eax
	jmp err
0:	movl OLDMETRICS(%edx),%esi		# esi -> old metrics
	movl NEWMETRICS(%edx),%edi		# edi -> new metrics
	movl DP(%edx),%edx			# edx -> decisions
	movdqa thirtyones,%xmm7			# loop-invariant 5-bit mask, load once

1:	movl 16(%ebp),%eax			# eax = nbits
	decl %eax
	jl 2f					# passed zero, we're done
	movl %eax,16(%ebp)

	movl 12(%ebp),%ebx			# ebx = syms
	movzbl (%ebx),%eax
	movd %eax,%xmm6				# xmm6[0] = first symbol
	movzbl 1(%ebx),%eax
	movd %eax,%xmm5				# xmm5[0] = second symbol
	addl $2,%ebx
	movl %ebx,12(%ebp)

	punpcklbw %xmm6,%xmm6			# xmm6[1] = xmm6[0]
	punpcklbw %xmm5,%xmm5
	pshuflw $0,%xmm6,%xmm6			# copy low word to all 4 low words
	pshuflw $0,%xmm5,%xmm5
	punpcklqdq %xmm6,%xmm6			# propagate to all 16
	punpcklqdq %xmm5,%xmm5
	# xmm6 now contains first symbol in each byte, xmm5 the second

	# invoke macro 8 times for a total of 128 butterflies
	.irp G,0,1,2,3,4,5,6,7
	butterfly \G
	.endr

	addl $32,%edx				# bump decision pointer

	# see if we have to normalize
	movzbl (%edi),%eax			# extract first output metric
	cmpl $50,%eax				# is it greater than 50?
	jbe done				# No, no need to normalize

	# Normalize by finding smallest metric and subtracting it
	# from all metrics
	movdqa (%edi),%xmm0
	.irp OFF,16,32,48,64,80,96,112,128,144,160,176,192,208,224,240
	pminub \OFF(%edi),%xmm0
	.endr

	# crunch down to single lowest metric
	movdqa %xmm0,%xmm1
	psrldq $8,%xmm0				# the count to psrldq is bytes, not bits!
	pminub %xmm1,%xmm0
	movdqa %xmm0,%xmm1
	psrlq $32,%xmm0
	pminub %xmm1,%xmm0
	movdqa %xmm0,%xmm1
	psrlq $16,%xmm0
	pminub %xmm1,%xmm0
	movdqa %xmm0,%xmm1
	psrlq $8,%xmm0
	pminub %xmm1,%xmm0

	punpcklbw %xmm0,%xmm0			# lowest 2 bytes
	pshuflw $0,%xmm0,%xmm0			# lowest 8 bytes
	punpcklqdq %xmm0,%xmm0			# all 16 bytes

	# xmm0 now contains lowest metric in all 16 bytes
	# subtract it from every output metric
	.irp OFF,0,16,32,48,64,80,96,112,128,144,160,176,192,208,224,240
	movdqa \OFF(%edi),%xmm1
	psubusb %xmm0,%xmm1
	movdqa %xmm1,\OFF(%edi)
	.endr

done:
	# swap metrics
	movl %esi,%eax
	movl %edi,%esi
	movl %eax,%edi
	jmp 1b

2:	movl 8(%ebp),%ebx			# ebx = vp
	# stash metric pointers
	movl %esi,OLDMETRICS(%ebx)
	movl %edi,NEWMETRICS(%ebx)
	movl %edx,DP(%ebx)			# stash incremented value of vp->dp
	xorl %eax,%eax
err:	popl %ebx
	popl %edx
	popl %edi
	popl %esi
	popl %ebp
	ret
	.size update_viterbi29_blk_sse2,.-update_viterbi29_blk_sse2

	.data
	.align 16
thirtyones:
	.byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31