Home | History | Annotate | Download | only in fec
      1 /* 16-bit signed integer dot product
      2  * Altivec-assisted version
      3  * Copyright 2004 Phil Karn
      4  * May be used under the terms of the GNU Lesser General Public License (LGPL)
      5  */
#include <stdint.h>
#include <stdlib.h>
#include "fec.h"
      8 
      9 struct dotprod {
     10   int len; /* Number of coefficients */
     11 
     12   /* On an Altivec machine, these hold 8 copies of the coefficients,
     13    * preshifted by 0,1,..7 words to meet all possible input data
     14    */
     15   signed short *coeffs[8];
     16 };
     17 
     18 /* Create and return a descriptor for use with the dot product function */
     19 void *initdp_av(signed short coeffs[],int len){
     20   struct dotprod *dp;
     21   int i,j;
     22 
     23   if(len == 0)
     24     return NULL;
     25 
     26   dp = (struct dotprod *)calloc(1,sizeof(struct dotprod));
     27   dp->len = len;
     28 
     29   /* Make 8 copies of coefficients, one for each data alignment,
     30    * each aligned to 16-byte boundary
     31    */
     32   for(i=0;i<8;i++){
     33     dp->coeffs[i] = calloc(1+(len+i-1)/8,sizeof(vector signed short));
     34     for(j=0;j<len;j++)
     35       dp->coeffs[i][j+i] = coeffs[j];
     36   }
     37   return (void *)dp;
     38 }
     39 
     40 
     41 /* Free a dot product descriptor created earlier */
     42 void freedp_av(void *p){
     43   struct dotprod *dp = (struct dotprod *)p;
     44   int i;
     45 
     46   for(i=0;i<8;i++)
     47     if(dp->coeffs[i] != NULL)
     48       free(dp->coeffs[i]);
     49   free(dp);
     50 }
     51 
     52 /* Compute a dot product given a descriptor and an input array
     53  * The length is taken from the descriptor
     54  */
     55 long dotprod_av(void *p,signed short a[]){
     56   struct dotprod *dp = (struct dotprod *)p;
     57   int al;
     58   vector signed short *ar,*d;
     59   vector signed int sums0,sums1,sums2,sums3;
     60   union { vector signed int v; signed int w[4];} s;
     61   int nblocks;
     62 
     63   /* round ar down to beginning of 16-byte block containing 0th element of
     64    * input buffer. Then set d to one of 8 sets of shifted coefficients
     65    */
     66   ar = (vector signed short *)((int)a & ~15);
     67   al = ((int)a & 15)/sizeof(signed short);
     68   d = (vector signed short *)dp->coeffs[al];
     69 
     70   nblocks = (dp->len+al-1)/8+1;
     71 
     72   /* Sum into four vectors each holding four 32-bit partial sums */
     73   sums3 = sums2 = sums1 = sums0 = (vector signed int)(0);
     74   while(nblocks >= 4){
     75     sums0 = vec_msums(ar[nblocks-1],d[nblocks-1],sums0);
     76     sums1 = vec_msums(ar[nblocks-2],d[nblocks-2],sums1);
     77     sums2 = vec_msums(ar[nblocks-3],d[nblocks-3],sums2);
     78     sums3 = vec_msums(ar[nblocks-4],d[nblocks-4],sums3);
     79     nblocks -= 4;
     80   }
     81   sums0 = vec_adds(sums0,sums1);
     82   sums2 = vec_adds(sums2,sums3);
     83   sums0 = vec_adds(sums0,sums2);
     84   while(nblocks-- > 0){
     85     sums0 = vec_msums(ar[nblocks],d[nblocks],sums0);
     86   }
     87   /* Sum 4 partial sums into final result */
     88   s.v = vec_sums(sums0,(vector signed int)(0));
     89 
     90   return s.w[3];
     91 }
     92 
     93 
     94