1 /* 16-bit signed integer dot product 2 * Altivec-assisted version 3 * Copyright 2004 Phil Karn 4 * May be used under the terms of the GNU Lesser General Public License (LGPL) 5 */ 6 #include <stdlib.h> 7 #include "fec.h" 8 9 struct dotprod { 10 int len; /* Number of coefficients */ 11 12 /* On an Altivec machine, these hold 8 copies of the coefficients, 13 * preshifted by 0,1,..7 words to meet all possible input data 14 */ 15 signed short *coeffs[8]; 16 }; 17 18 /* Create and return a descriptor for use with the dot product function */ 19 void *initdp_av(signed short coeffs[],int len){ 20 struct dotprod *dp; 21 int i,j; 22 23 if(len == 0) 24 return NULL; 25 26 dp = (struct dotprod *)calloc(1,sizeof(struct dotprod)); 27 dp->len = len; 28 29 /* Make 8 copies of coefficients, one for each data alignment, 30 * each aligned to 16-byte boundary 31 */ 32 for(i=0;i<8;i++){ 33 dp->coeffs[i] = calloc(1+(len+i-1)/8,sizeof(vector signed short)); 34 for(j=0;j<len;j++) 35 dp->coeffs[i][j+i] = coeffs[j]; 36 } 37 return (void *)dp; 38 } 39 40 41 /* Free a dot product descriptor created earlier */ 42 void freedp_av(void *p){ 43 struct dotprod *dp = (struct dotprod *)p; 44 int i; 45 46 for(i=0;i<8;i++) 47 if(dp->coeffs[i] != NULL) 48 free(dp->coeffs[i]); 49 free(dp); 50 } 51 52 /* Compute a dot product given a descriptor and an input array 53 * The length is taken from the descriptor 54 */ 55 long dotprod_av(void *p,signed short a[]){ 56 struct dotprod *dp = (struct dotprod *)p; 57 int al; 58 vector signed short *ar,*d; 59 vector signed int sums0,sums1,sums2,sums3; 60 union { vector signed int v; signed int w[4];} s; 61 int nblocks; 62 63 /* round ar down to beginning of 16-byte block containing 0th element of 64 * input buffer. Then set d to one of 8 sets of shifted coefficients 65 */ 66 ar = (vector signed short *)((int)a & ~15); 67 al = ((int)a & 15)/sizeof(signed short); 68 d = (vector signed short *)dp->coeffs[al]; 69 70 nblocks = (dp->len+al-1)/8+1; 71 72 /* Sum into four vectors each holding four 32-bit partial sums */ 73 sums3 = sums2 = sums1 = sums0 = (vector signed int)(0); 74 while(nblocks >= 4){ 75 sums0 = vec_msums(ar[nblocks-1],d[nblocks-1],sums0); 76 sums1 = vec_msums(ar[nblocks-2],d[nblocks-2],sums1); 77 sums2 = vec_msums(ar[nblocks-3],d[nblocks-3],sums2); 78 sums3 = vec_msums(ar[nblocks-4],d[nblocks-4],sums3); 79 nblocks -= 4; 80 } 81 sums0 = vec_adds(sums0,sums1); 82 sums2 = vec_adds(sums2,sums3); 83 sums0 = vec_adds(sums0,sums2); 84 while(nblocks-- > 0){ 85 sums0 = vec_msums(ar[nblocks],d[nblocks],sums0); 86 } 87 /* Sum 4 partial sums into final result */ 88 s.v = vec_sums(sums0,(vector signed int)(0)); 89 90 return s.w[3]; 91 } 92 93 94