# dotprod_mmx_assist: MMX dot-product kernel (32-bit x86, GNU as / AT&T syntax)
# SIMD MMX dot product
# Equivalent to the following C code:
# long dotprod(signed short *a,signed short *b,int cnt)
# {
#	long sum = 0;
#	cnt *= 4;
#	while(cnt--)
#		sum += *a++ * *b++;
#	return sum;
# }
# a and b should also be 64-bit aligned, or speed will suffer greatly
# Copyright 1999, Phil Karn KA9Q
# May be used under the terms of the GNU Lesser General Public License (LGPL)
     14 
     15 	.text
     16 	.global dotprod_mmx_assist
     17 	.type dotprod_mmx_assist,@function
     18 dotprod_mmx_assist:
     19 	pushl %ebp
     20 	movl %esp,%ebp
     21 	pushl %esi
     22 	pushl %edi
     23 	pushl %ecx
     24 	pushl %ebx
     25 	movl 8(%ebp),%esi	# a
     26 	movl 12(%ebp),%edi	# b
     27 	movl 16(%ebp),%ecx	# cnt
     28 	pxor %mm0,%mm0		# clear running sum (in two 32-bit halves)
     29 
     30 # MMX dot product loop unrolled 4 times, crunching 16 terms per loop
     31 	.align 16
     32 .Loop1:	subl $4,%ecx
     33 	jl   .Loop1Done
     34 
     35 	movq (%esi),%mm1	# mm1 = a[3],a[2],a[1],a[0]
     36  	pmaddwd (%edi),%mm1	# mm1 = b[3]*a[3]+b[2]*a[2],b[1]*a[1]+b[0]*a[0]
     37 	paddd %mm1,%mm0
     38 
     39 	movq 8(%esi),%mm1
     40 	pmaddwd 8(%edi),%mm1
     41 	paddd %mm1,%mm0
     42 
     43 	movq 16(%esi),%mm1
     44 	pmaddwd 16(%edi),%mm1
     45 	paddd %mm1,%mm0
     46 
     47 	movq 24(%esi),%mm1
     48 	addl $32,%esi
     49 	pmaddwd 24(%edi),%mm1
     50 	addl $32,%edi
     51 	paddd %mm1,%mm0
     52 
     53 	jmp .Loop1
     54 .Loop1Done:
     55 
     56 	addl $4,%ecx
     57 
     58 # MMX dot product loop, not unrolled, crunching 4 terms per loop
     59 # This could be redone as Duff's Device on the unrolled loop above
     60 .Loop2:	subl $1,%ecx
     61 	jl   .Loop2Done
     62 
     63 	movq (%esi),%mm1
     64 	addl $8,%esi
     65 	pmaddwd (%edi),%mm1
     66 	addl $8,%edi
     67 	paddd %mm1,%mm0
     68 	jmp .Loop2
     69 .Loop2Done:
     70 
     71 	movd %mm0,%ebx		# right-hand word to ebx
     72 	punpckhdq %mm0,%mm0	# left-hand word to right side of %mm0
     73 	movd %mm0,%eax
     74 	addl %ebx,%eax		# running sum now in %eax
     75 	emms			# done with MMX
     76 
     77 	popl %ebx
     78 	popl %ecx
     79 	popl %edi
     80 	popl %esi
     81 	movl %ebp,%esp
     82 	popl %ebp
     83 	ret
     84