/* K=9 r=1/3 Viterbi decoder for x86 SSE2
 * Copyright Mar 2004, Phil Karn, KA9Q
 * May be used under the terms of the GNU Lesser General Public License (LGPL)
 */
#include <emmintrin.h>
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <limits.h>
#include "fec.h"

/* 32-bit words in w[] are assumed by the chainback bit indexing (endstate/32, endstate%32) */
typedef union { unsigned int w[8]; unsigned short s[16];} decision_t;
typedef union { signed short s[256]; __m128i v[32];} metric_t;

static union branchtab39 { unsigned short s[128]; __m128i v[16];} Branchtab39[3];
static int Init = 0;

/* State info for instance of Viterbi decoder */
struct v39 {
  metric_t metrics1; /* path metric buffer 1 */
  metric_t metrics2; /* path metric buffer 2 */
  void *dp;          /* Pointer to current decision */
  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
  void *decisions;   /* Beginning of decisions for block */
};

/* Initialize Viterbi decoder for start of new frame */
int init_viterbi39_sse2(void *p,int starting_state){
  struct v39 *vp = p;
  int i;

  for(i=0;i<256;i++)
    vp->metrics1.s[i] = (SHRT_MIN+1000);

  vp->old_metrics = &vp->metrics1;
  vp->new_metrics = &vp->metrics2;
  vp->dp = vp->decisions;
  vp->old_metrics->s[starting_state & 255] = SHRT_MIN; /* Bias known start state */
  return 0;
}

/* Create a new instance of a Viterbi decoder */
void *create_viterbi39_sse2(int len){
  void *p;
  struct v39 *vp;

  if(!Init){
    int polys[3] = { V39POLYA, V39POLYB, V39POLYC };

    set_viterbi39_polynomial_sse2(polys);
  }
  /* Ordinary malloc() only returns 8-byte alignment, we need 16 */
  if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v39)))
    return NULL;

  vp = (struct v39 *)p;
  if((p = malloc((len+8)*sizeof(decision_t))) == NULL){
    free(vp);
    return NULL;
  }
  vp->decisions = (decision_t *)p;
  init_viterbi39_sse2(vp,0);
  return vp;
}

void set_viterbi39_polynomial_sse2(int polys[3]){
  int state;

  for(state=0;state < 128;state++){
    Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & polys[0]) ? 255:0;
    Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & polys[1]) ? 255:0;
    Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & polys[2]) ? 255:0;
  }
  Init++;
}
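
/* Added note on the branch table built above: each Branchtab39[j].s[state] is
 * the encoder's expected j-th output symbol for one of the 128 butterflies,
 * scaled to 0 or 255.  Passing a negative polynomial flags an inverted output;
 * the (polys[j] < 0) term flips the computed parity so the table still matches
 * the transmitted symbols.  In update_viterbi39_blk_sse2() these entries are
 * XORed with the offset-binary received symbols (0-255) and summed to form
 * soft branch metrics in the range 0-765.
 */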
/* Viterbi chainback */
int chainback_viterbi39_sse2(
      void *p,
      unsigned char *data,    /* Decoded output data */
      unsigned int nbits,     /* Number of data bits */
      unsigned int endstate){ /* Terminal encoder state */
  struct v39 *vp = p;
  decision_t *d = (decision_t *)vp->decisions;
  int path_metric;

  endstate %= 256;

  path_metric = vp->old_metrics->s[endstate];

  /* The store into data[] only needs to be done every 8 bits.
   * But this avoids a conditional branch, and the writes will
   * combine in the cache anyway
   */
  d += 8; /* Look past tail */
  while(nbits-- != 0){
    int k;

    k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1;
    endstate = (k << 7) | (endstate >> 1);
    data[nbits>>3] = endstate;
  }
  return path_metric;
}

/* Delete instance of a Viterbi decoder */
void delete_viterbi39_sse2(void *p){
  struct v39 *vp = p;

  if(vp != NULL){
    free(vp->decisions);
    free(vp);
  }
}

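/* Added note on the add-compare-select loop below: metric vectors v[0..15] of
 * old_metrics cover states 0-127 and v[16..31] cover states 128-255, eight
 * 16-bit metrics per __m128i.  For each butterfly, old states s and s+128
 * both feed new states 2s and 2s+1; survivor0/survivor1 hold the new metrics
 * of the even/odd successors, and the recorded decision bit is 1 when the
 * survivor came from old state s+128, which is why chainback_viterbi39_sse2()
 * rebuilds the predecessor as (decision << 7) | (state >> 1).
 */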
int update_viterbi39_blk_sse2(void *p,unsigned char *syms,int nbits){
  struct v39 *vp = p;
  decision_t *d = (decision_t *)vp->dp;
  int path_metric = 0;

  while(nbits--){
    __m128i sym0v,sym1v,sym2v;
    void *tmp;
    int i;

    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
    sym0v = _mm_set1_epi16(syms[0]);
    sym1v = _mm_set1_epi16(syms[1]);
    sym2v = _mm_set1_epi16(syms[2]);
    syms += 3;

    /* SSE2 has no unsigned 16-bit minimum (PMINUW is SSE4.1), so the path metrics are kept as signed shorts */
    for(i=0;i<16;i++){
      __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;

      /* Form branch metrics
       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
       * the XOR operations constitute conditional negation.
       * metric and m_metric (-metric) are in the range 0-765
       */
      m0 = _mm_add_epi16(_mm_xor_si128(Branchtab39[0].v[i],sym0v),_mm_xor_si128(Branchtab39[1].v[i],sym1v));
      metric = _mm_add_epi16(_mm_xor_si128(Branchtab39[2].v[i],sym2v),m0);
      m_metric = _mm_sub_epi16(_mm_set1_epi16(765),metric);

      /* Add branch metrics to path metrics */
      m0 = _mm_adds_epi16(vp->old_metrics->v[i],metric);
      m3 = _mm_adds_epi16(vp->old_metrics->v[16+i],metric);
      m1 = _mm_adds_epi16(vp->old_metrics->v[16+i],m_metric);
      m2 = _mm_adds_epi16(vp->old_metrics->v[i],m_metric);

      /* Compare and select */
      survivor0 = _mm_min_epi16(m0,m1);
      survivor1 = _mm_min_epi16(m2,m3);
      decision0 = _mm_cmpeq_epi16(survivor0,m1);
      decision1 = _mm_cmpeq_epi16(survivor1,m3);

      /* Pack each set of decisions into 8 8-bit bytes, then interleave them and compress into 16 bits */
      d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0,_mm_setzero_si128()),_mm_packs_epi16(decision1,_mm_setzero_si128())));

      /* Store surviving metrics */
      vp->new_metrics->v[2*i] = _mm_unpacklo_epi16(survivor0,survivor1);
      vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi16(survivor0,survivor1);
    }
    /* See if we need to renormalize */
    if(vp->new_metrics->s[0] >= SHRT_MAX-5000){
      int i,adjust;
      __m128i adjustv;
      union { __m128i v; signed short w[8]; } t;

      /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */
      adjustv = vp->new_metrics->v[0];
      for(i=1;i<32;i++)
        adjustv = _mm_min_epi16(adjustv,vp->new_metrics->v[i]);

      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,8));
      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,4));
      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,2));
      t.v = adjustv;
      adjust = t.w[0] - SHRT_MIN;
      path_metric += adjust;
      adjustv = _mm_set1_epi16(adjust);

      /* We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX.
       * This is okay since the adjusted metrics can't overflow anyway
       */
      for(i=0;i<32;i++)
        vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i],adjustv);
    }
    d++;
    /* Swap pointers to old and new metrics */
    tmp = vp->old_metrics;
    vp->old_metrics = vp->new_metrics;
    vp->new_metrics = tmp;
  }
  vp->dp = d;
  return path_metric;
}
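
/* Illustrative usage sketch (added; not part of the original source).  It only
 * shows the expected call sequence: create, (re)init, run the add-compare-select
 * over the data bits plus the 8-bit tail, chain back, delete.  FRAMEBITS and the
 * VITERBI39_SSE2_EXAMPLE guard are arbitrary choices made for this example, and
 * the symbol buffer is filled with the mid-scale value 128 (pure erasures)
 * instead of real encoder output, so the decoded bytes are meaningless.
 */
#ifdef VITERBI39_SSE2_EXAMPLE
#define FRAMEBITS 2048

int main(void){
  unsigned char symbols[3*(FRAMEBITS+8)]; /* 3 soft symbols per bit, data + tail */
  unsigned char data[FRAMEBITS/8];
  void *vp;
  int i,metric;

  if((vp = create_viterbi39_sse2(FRAMEBITS)) == NULL){
    fprintf(stderr,"create_viterbi39_sse2 failed\n");
    exit(1);
  }
  for(i=0;i<3*(FRAMEBITS+8);i++)
    symbols[i] = 128; /* placeholder soft decisions (erasures) */

  init_viterbi39_sse2(vp,0);                              /* known starting state 0 */
  update_viterbi39_blk_sse2(vp,symbols,FRAMEBITS+8);      /* data bits plus tail */
  metric = chainback_viterbi39_sse2(vp,data,FRAMEBITS,0); /* trace back to state 0 */
  printf("final path metric %d\n",metric);
  delete_viterbi39_sse2(vp);
  return 0;
}
#endif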