1 /* K=15 r=1/6 Viterbi decoder for x86 SSE 2 * Copyright Mar 2004, Phil Karn, KA9Q 3 * May be used under the terms of the GNU Lesser General Public License (LGPL) 4 */ 5 #include <xmmintrin.h> 6 #include <stdio.h> 7 #include <stdlib.h> 8 #include <memory.h> 9 #include <limits.h> 10 #include "fec.h" 11 12 typedef union { unsigned long w[512]; unsigned char c[2048];} decision_t; 13 typedef union { signed short s[16384]; __m64 v[4096];} metric_t; 14 15 static union branchtab615 { unsigned short s[8192]; __m64 v[2048];} Branchtab615[6]; 16 static int Init = 0; 17 18 /* State info for instance of Viterbi decoder */ 19 struct v615 { 20 metric_t metrics1; /* path metric buffer 1 */ 21 metric_t metrics2; /* path metric buffer 2 */ 22 void *dp; /* Pointer to current decision */ 23 metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ 24 void *decisions; /* Beginning of decisions for block */ 25 }; 26 27 /* Initialize Viterbi decoder for start of new frame */ 28 int init_viterbi615_sse(void *p,int starting_state){ 29 struct v615 *vp = p; 30 int i; 31 32 if(p == NULL) 33 return -1; 34 for(i=0;i<16384;i++) 35 vp->metrics1.s[i] = (SHRT_MIN+5000); 36 37 vp->old_metrics = &vp->metrics1; 38 vp->new_metrics = &vp->metrics2; 39 vp->dp = vp->decisions; 40 vp->old_metrics->s[starting_state & 16383] = SHRT_MIN; /* Bias known start state */ 41 return 0; 42 } 43 44 /* Create a new instance of a Viterbi decoder */ 45 void *create_viterbi615_sse(int len){ 46 struct v615 *vp; 47 48 if(!Init){ 49 int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF }; 50 set_viterbi615_polynomial_sse(polys); 51 } 52 53 if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL){ 54 return NULL; 55 } 56 if((vp->decisions = malloc((len+14)*sizeof(decision_t))) == NULL){ 57 free(vp); 58 return NULL; 59 } 60 init_viterbi615_sse(vp,0); 61 return vp; 62 } 63 64 void set_viterbi615_polynomial_sse(int polys[6]){ 65 int state; 66 int i; 67 68 for(state=0;state < 8192;state++){ 69 for(i=0;i<6;i++) 70 Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0; 71 } 72 Init++; 73 } 74 75 /* Viterbi chainback */ 76 int chainback_viterbi615_sse( 77 void *p, 78 unsigned char *data, /* Decoded output data */ 79 unsigned int nbits, /* Number of data bits */ 80 unsigned int endstate){ /* Terminal encoder state */ 81 struct v615 *vp = p; 82 decision_t *d; 83 84 if(p == NULL) 85 return -1; 86 d = (decision_t *)vp->decisions; 87 endstate %= 16384; 88 89 /* The store into data[] only needs to be done every 8 bits. 90 * But this avoids a conditional branch, and the writes will 91 * combine in the cache anyway 92 */ 93 d += 14; /* Look past tail */ 94 while(nbits-- != 0){ 95 int k; 96 97 /* k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1;*/ 98 k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; 99 endstate = (k << 13) | (endstate >> 1); 100 data[nbits>>3] = endstate >> 6; 101 } 102 return 0; 103 } 104 105 /* Delete instance of a Viterbi decoder */ 106 void delete_viterbi615_sse(void *p){ 107 struct v615 *vp = p; 108 109 if(vp != NULL){ 110 free(vp->decisions); 111 free(vp); 112 } 113 } 114 115 116 int update_viterbi615_blk_sse(void *p,unsigned char *syms,int nbits){ 117 struct v615 *vp = p; 118 decision_t *d; 119 120 if(p == NULL) 121 return -1; 122 d = (decision_t *)vp->dp; 123 while(nbits--){ 124 __m64 sym0v,sym1v,sym2v,sym3v,sym4v,sym5v; 125 void *tmp; 126 int i; 127 128 /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ 129 sym0v = _mm_set1_pi16(syms[0]); 130 sym1v = _mm_set1_pi16(syms[1]); 131 sym2v = _mm_set1_pi16(syms[2]); 132 sym3v = _mm_set1_pi16(syms[3]); 133 sym4v = _mm_set1_pi16(syms[4]); 134 sym5v = _mm_set1_pi16(syms[5]); 135 syms += 6; 136 137 for(i=0;i<2048;i++){ 138 __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; 139 140 /* Form branch metrics 141 * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, 142 * the XOR operations constitute conditional negation. 143 * metric and m_metric (-metric) are in the range 0-1530 144 */ 145 m0 = _mm_add_pi16(_mm_xor_si64(Branchtab615[0].v[i],sym0v),_mm_xor_si64(Branchtab615[1].v[i],sym1v)); 146 m1 = _mm_add_pi16(_mm_xor_si64(Branchtab615[2].v[i],sym2v),_mm_xor_si64(Branchtab615[3].v[i],sym3v)); 147 m2 = _mm_add_pi16(_mm_xor_si64(Branchtab615[4].v[i],sym4v),_mm_xor_si64(Branchtab615[5].v[i],sym5v)); 148 metric = _mm_add_pi16(m0,_mm_add_pi16(m1,m2)); 149 m_metric = _mm_sub_pi16(_mm_set1_pi16(1530),metric); 150 151 /* Add branch metrics to path metrics */ 152 m0 = _mm_adds_pi16(vp->old_metrics->v[i],metric); 153 m3 = _mm_adds_pi16(vp->old_metrics->v[2048+i],metric); 154 m1 = _mm_adds_pi16(vp->old_metrics->v[2048+i],m_metric); 155 m2 = _mm_adds_pi16(vp->old_metrics->v[i],m_metric); 156 157 /* Compare and select */ 158 survivor0 = _mm_min_pi16(m0,m1); 159 survivor1 = _mm_min_pi16(m2,m3); 160 decision0 = _mm_cmpeq_pi16(survivor0,m1); 161 decision1 = _mm_cmpeq_pi16(survivor1,m3); 162 163 /* Pack decisions into 8 bits and store */ 164 d->c[i] = _mm_movemask_pi8(_mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64()))); 165 166 /* Store surviving metrics */ 167 vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1); 168 vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1); 169 } 170 /* See if we need to renormalize 171 * Max metric spread for this code with 0-255 branch metrics is 12750 172 */ 173 if(vp->new_metrics->s[0] >= SHRT_MAX-12750){ 174 int i,adjust; 175 __m64 adjustv; 176 union { __m64 v; signed short w[4]; } t; 177 178 /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */ 179 adjustv = vp->new_metrics->v[0]; 180 for(i=1;i<4096;i++) 181 adjustv = _mm_min_pi16(adjustv,vp->new_metrics->v[i]); 182 183 adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,32)); 184 adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,16)); 185 t.v = adjustv; 186 adjust = t.w[0] - SHRT_MIN; 187 adjustv = _mm_set1_pi16(adjust); 188 189 for(i=0;i<4096;i++) 190 vp->new_metrics->v[i] = _mm_sub_pi16(vp->new_metrics->v[i],adjustv); 191 } 192 d++; 193 /* Swap pointers to old and new metrics */ 194 tmp = vp->old_metrics; 195 vp->old_metrics = vp->new_metrics; 196 vp->new_metrics = tmp; 197 } 198 vp->dp = d; 199 _mm_empty(); 200 return 0; 201 } 202