Home | History | Annotate | Download | only in fec
      1 /* K=15 r=1/6 Viterbi decoder for PowerPC G4/G5 Altivec vector instructions
      2  * 8-bit offset-binary soft decision samples
      3  * Copyright Mar 2004, Phil Karn, KA9Q
      4  * May be used under the terms of the GNU Lesser General Public License (LGPL)
      5  */
      6 #include <stdio.h>
      7 #include <stdlib.h>
      8 #include <memory.h>
      9 #include <limits.h>
     10 #include "fec.h"
     11 
     12 typedef union { unsigned char c[128][16]; vector unsigned char v[128]; } decision_t;
     13 typedef union { unsigned short s[16384]; vector unsigned short v[2048]; } metric_t;
     14 
     15 static union branchtab615 { unsigned short s[8192]; vector unsigned short v[1024];} Branchtab615[6];
     16 static int Init = 0;
     17 
     18 /* State info for instance of Viterbi decoder */
     19 struct v615 {
     20   metric_t metrics1; /* path metric buffer 1 */
     21   metric_t metrics2; /* path metric buffer 2 */
     22   void *dp;          /* Pointer to current decision */
     23   metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
     24   void *decisions;   /* Beginning of decisions for block */
     25 };
     26 
     27 /* Initialize Viterbi decoder for start of new frame */
     28 int init_viterbi615_av(void *p,int starting_state){
     29   struct v615 *vp = p;
     30   int i;
     31 
     32   if(p == NULL)
     33     return -1;
     34 
     35   for(i=0;i<2048;i++)
     36     vp->metrics1.v[i] = (vector unsigned short)(5000);
     37 
     38   vp->old_metrics = &vp->metrics1;
     39   vp->new_metrics = &vp->metrics2;
     40   vp->dp = vp->decisions;
     41   vp->old_metrics->s[starting_state & 16383] = 0; /* Bias known start state */
     42   return 0;
     43 }
     44 
     45 /* Create a new instance of a Viterbi decoder */
     46 void *create_viterbi615_av(int len){
     47   struct v615 *vp;
     48 
     49   if(!Init){
     50     int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF };
     51     set_viterbi615_polynomial_av(polys);
     52   }
     53   vp = (struct v615 *)malloc(sizeof(struct v615));
     54   vp->decisions = malloc(sizeof(decision_t)*(len+14));
     55   init_viterbi615_av(vp,0);
     56   return vp;
     57 }
     58 
     59 void set_viterbi615_polynomial_av(int polys[6]){
     60   int state;
     61   int i;
     62 
     63   for(state=0;state < 8192;state++){
     64     for(i=0;i<6;i++)
     65       Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0;
     66   }
     67   Init++;
     68 }
     69 
     70 
     71 /* Viterbi chainback */
     72 int chainback_viterbi615_av(
     73       void *p,
     74       unsigned char *data, /* Decoded output data */
     75       unsigned int nbits, /* Number of data bits */
     76       unsigned int endstate){ /* Terminal encoder state */
     77   struct v615 *vp = p;
     78   decision_t *d = (decision_t *)vp->decisions;
     79   int path_metric;
     80 
     81   endstate %= 16384;
     82 
     83   path_metric = vp->old_metrics->s[endstate];
     84 
     85   /* The store into data[] only needs to be done every 8 bits.
     86    * But this avoids a conditional branch, and the writes will
     87    * combine in the cache anyway
     88    */
     89   d += 14; /* Look past tail */
     90   while(nbits-- != 0){
     91     int k;
     92 
     93     k = (d[nbits].c[endstate >> 7][endstate & 15] & (0x80 >> ((endstate>>4)&7)) ) ? 1 : 0;
     94     endstate = (k << 13) | (endstate >> 1);
     95     data[nbits>>3] = endstate >> 6;
     96   }
     97   return path_metric;
     98 }
     99 
    100 /* Delete instance of a Viterbi decoder */
    101 void delete_viterbi615_av(void *p){
    102   struct v615 *vp = p;
    103 
    104   if(vp != NULL){
    105     free(vp->decisions);
    106     free(vp);
    107   }
    108 }
    109 
    110 int update_viterbi615_blk_av(void *p,unsigned char *syms,int nbits){
    111   struct v615 *vp = p;
    112   decision_t *d = (decision_t *)vp->dp;
    113   int path_metric = 0;
    114   vector unsigned char decisions = (vector unsigned char)(0);
    115 
    116   while(nbits--){
    117     vector unsigned short symv,sym0v,sym1v,sym2v,sym3v,sym4v,sym5v;
    118     vector unsigned char s;
    119     void *tmp;
    120     int i;
    121 
    122     /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
    123     s = (vector unsigned char)vec_perm(vec_ld(0,syms),vec_ld(5,syms),vec_lvsl(0,syms));
    124 
    125     symv = (vector unsigned short)vec_mergeh((vector unsigned char)(0),s);    /* Unsigned byte->word unpack */
    126     sym0v = vec_splat(symv,0);
    127     sym1v = vec_splat(symv,1);
    128     sym2v = vec_splat(symv,2);
    129     sym3v = vec_splat(symv,3);
    130     sym4v = vec_splat(symv,4);
    131     sym5v = vec_splat(symv,5);
    132     syms += 6;
    133 
    134     for(i=0;i<1024;i++){
    135       vector bool short decision0,decision1;
    136       vector unsigned short metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
    137 
    138       /* Form branch metrics
    139        * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
    140        * the XOR operations constitute conditional negation.
    141        * metric and m_metric (-metric) are in the range 0-1530
    142        */
    143       m0 = vec_add(vec_xor(Branchtab615[0].v[i],sym0v),vec_xor(Branchtab615[1].v[i],sym1v));
    144       m1 = vec_add(vec_xor(Branchtab615[2].v[i],sym2v),vec_xor(Branchtab615[3].v[i],sym3v));
    145       m2 = vec_add(vec_xor(Branchtab615[4].v[i],sym4v),vec_xor(Branchtab615[5].v[i],sym5v));
    146       metric = vec_add(m0,m1);
    147       metric = vec_add(metric,m2);
    148       m_metric = vec_sub((vector unsigned short)(1530),metric);
    149 
    150       /* Add branch metrics to path metrics */
    151       m0 = vec_adds(vp->old_metrics->v[i],metric);
    152       m3 = vec_adds(vp->old_metrics->v[1024+i],metric);
    153       m1 = vec_adds(vp->old_metrics->v[1024+i],m_metric);
    154       m2 = vec_adds(vp->old_metrics->v[i],m_metric);
    155 
    156       /* Compare and select */
    157       decision0 = vec_cmpgt(m0,m1);
    158       decision1 = vec_cmpgt(m2,m3);
    159       survivor0 = vec_min(m0,m1);
    160       survivor1 = vec_min(m2,m3);
    161 
    162       /* Store decisions and survivors.
    163        * To save space without SSE2's handy PMOVMSKB instruction, we pack and store them in
    164        * a funny interleaved fashion that we undo in the chainback function.
    165        */
    166       decisions = vec_add(decisions,decisions); /* Shift each byte 1 bit to the left */
    167 
    168       /* Booleans are either 0xff or 0x00. Subtracting 0x00 leaves the lsb zero; subtracting
    169        * 0xff is equivalent to adding 1, which sets the lsb.
    170        */
    171       decisions = vec_sub(decisions,(vector unsigned char)vec_pack(vec_mergeh(decision0,decision1),vec_mergel(decision0,decision1)));
    172 
    173       vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1);
    174       vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1);
    175 
    176       if((i % 8) == 7){
    177 	/* We've accumulated a total of 128 decisions, stash and start again */
    178 	d->v[i>>3] = decisions; /* No need to clear, the new bits will replace the old */
    179       }
    180     }
    181 #if 0
    182     /* Experimentally determine metric spread
    183      * The results are fixed for a given code and input symbol size
    184      */
    185     {
    186       int i;
    187       vector unsigned short min_metric;
    188       vector unsigned short max_metric;
    189       union { vector unsigned short v; unsigned short s[8];} t;
    190       int minimum,maximum;
    191       static int max_spread = 0;
    192 
    193       min_metric = max_metric = vp->new_metrics->v[0];
    194       for(i=1;i<2048;i++){
    195 	min_metric = vec_min(min_metric,vp->new_metrics->v[i]);
    196 	max_metric = vec_max(max_metric,vp->new_metrics->v[i]);
    197       }
    198       min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,8));
    199       max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,8));
    200       min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,4));
    201       max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,4));
    202       min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,2));
    203       max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,2));
    204 
    205       t.v = min_metric;
    206       minimum = t.s[0];
    207       t.v = max_metric;
    208       maximum = t.s[0];
    209       if(maximum-minimum > max_spread){
    210 	max_spread = maximum-minimum;
    211 	printf("metric spread = %d\n",max_spread);
    212       }
    213     }
    214 #endif
    215 
    216     /* Renormalize if necessary. This deserves some explanation.
    217 
    218      * The maximum possible spread, found by experiment, for 4-bit symbols is 405; for 8 bit symbols, it's 12750.
    219      * So by looking at one arbitrary metric we can tell if any of them have possibly saturated.
    220      * However, this is very conservative. Large spreads occur only at very high Eb/No, where
    221      * saturating a bad path metric doesn't do much to increase its chances of being erroneously chosen as a survivor.
    222 
    223      * At more interesting (low) Eb/No ratios, the spreads are much smaller so our chances of saturating a metric
    224      * by not not normalizing when we should are extremely low. So either way, the risk to performance is small.
    225 
    226      * All this is borne out by experiment.
    227      */
    228     if(vp->new_metrics->s[0] >= USHRT_MAX-12750){
    229       vector unsigned short scale;
    230       union { vector unsigned short v; unsigned short s[8];} t;
    231 
    232       /* Find smallest metric and splat */
    233       scale = vp->new_metrics->v[0];
    234       for(i=1;i<2048;i++)
    235 	scale = vec_min(scale,vp->new_metrics->v[i]);
    236 
    237       scale = vec_min(scale,vec_sld(scale,scale,8));
    238       scale = vec_min(scale,vec_sld(scale,scale,4));
    239       scale = vec_min(scale,vec_sld(scale,scale,2));
    240 
    241       /* Subtract it from all metrics
    242        * Work backwards to try to improve the cache hit ratio, assuming LRU
    243        */
    244       for(i=2047;i>=0;i--)
    245 	vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale);
    246       t.v = scale;
    247       path_metric += t.s[0];
    248     }
    249     d++;
    250     /* Swap pointers to old and new metrics */
    251     tmp = vp->old_metrics;
    252     vp->old_metrics = vp->new_metrics;
    253     vp->new_metrics = tmp;
    254   }
    255   vp->dp = d;
    256   return path_metric;
    257 }
    258