Home | History | Annotate | Download | only in fec
      1 # SIMD SSE2 dot product
      2 # Equivalent to the following C code:
      3 # long dotprod(signed short *a,signed short *b,int cnt)
      4 # {
      5 #	long sum = 0;
      6 #	cnt *= 8;
      7 #	while(cnt--)
      8 #		sum += *a++ + *b++;
      9 #	return sum;
     10 # }
     11 # a and b must be 128-bit aligned
     12 # Copyright 2001, Phil Karn KA9Q
     13 # May be used under the terms of the GNU Lesser General Public License (LGPL)
     14 
     15 	.text
     16 	.global dotprod_sse2_assist
     17 	.type dotprod_sse2_assist,@function
     18 dotprod_sse2_assist:
     19 	pushl %ebp
     20 	movl %esp,%ebp
     21 	pushl %esi
     22 	pushl %edi
     23 	pushl %ecx
     24 	pushl %ebx
     25 	movl 8(%ebp),%esi	# a
     26 	movl 12(%ebp),%edi	# b
     27 	movl 16(%ebp),%ecx	# cnt
     28 	pxor %xmm0,%xmm0		# clear running sum (in two 32-bit halves)
     29 
     30 # SSE2 dot product loop unrolled 4 times, crunching 32 terms per loop
     31 	.align 16
     32 .Loop1:	subl $4,%ecx
     33 	jl   .Loop1Done
     34 
     35 	movdqa (%esi),%xmm1
     36  	pmaddwd (%edi),%xmm1
     37 	paddd %xmm1,%xmm0
     38 
     39 	movdqa 16(%esi),%xmm1
     40 	pmaddwd 16(%edi),%xmm1
     41 	paddd %xmm1,%xmm0
     42 
     43 	movdqa 32(%esi),%xmm1
     44 	pmaddwd 32(%edi),%xmm1
     45 	paddd %xmm1,%xmm0
     46 
     47 	movdqa 48(%esi),%xmm1
     48 	addl $64,%esi
     49 	pmaddwd 48(%edi),%xmm1
     50 	addl $64,%edi
     51 	paddd %xmm1,%xmm0
     52 
     53 	jmp .Loop1
     54 .Loop1Done:
     55 
     56 	addl $4,%ecx
     57 
     58 # SSE2 dot product loop, not unrolled, crunching 4 terms per loop
     59 # This could be redone as Duff's Device on the unrolled loop above
     60 .Loop2:	subl $1,%ecx
     61 	jl   .Loop2Done
     62 
     63 	movdqa (%esi),%xmm1
     64 	addl $16,%esi
     65 	pmaddwd (%edi),%xmm1
     66 	addl $16,%edi
     67 	paddd %xmm1,%xmm0
     68 	jmp .Loop2
     69 .Loop2Done:
     70 
     71 	movdqa %xmm0,%xmm1
     72 	psrldq $8,%xmm0
     73 	paddd %xmm1,%xmm0
     74 	movd %xmm0,%eax		# right-hand word to eax
     75 	psrldq $4,%xmm0
     76 	movd %xmm0,%ebx
     77 	addl %ebx,%eax
     78 
     79 	popl %ebx
     80 	popl %ecx
     81 	popl %edi
     82 	popl %esi
     83 	movl %ebp,%esp
     84 	popl %ebp
     85 	ret
     86