# SIMD SSE2 dot product
# Equivalent to the following C code:
# long dotprod(signed short *a,signed short *b,int cnt)
# {
#	long sum = 0;
#	cnt *= 8;
#	while(cnt--)
#		sum += *a++ * *b++;
#	return sum;
# }
# a and b must be 128-bit aligned
# Copyright 2001, Phil Karn KA9Q
# May be used under the terms of the GNU Lesser General Public License (LGPL)
#
# Target:  32-bit x86 with SSE2, GNU as (AT&T syntax), ELF
# In:      8(%ebp)  = a    pointer to signed 16-bit samples
#          12(%ebp) = b    pointer to signed 16-bit samples
#          16(%ebp) = cnt  number of 16-byte groups (8 samples each);
#                          cnt <= 0 processes nothing and returns 0
# Out:     %eax = sum of products, accumulated in 32-bit lanes
#          (wraps modulo 2^32)
# Saved:   %ebp, %esi, %edi, %ecx (restored before return)
# Clobb:   %xmm0, %xmm1, flags

	.text
	.global	dotprod_sse2_assist
	.type	dotprod_sse2_assist,@function
dotprod_sse2_assist:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%esi
	pushl	%edi
	pushl	%ecx			# scratch in the i386 ABI, but this routine has always preserved it
	movl	8(%ebp),%esi		# esi = a
	movl	12(%ebp),%edi		# edi = b
	movl	16(%ebp),%ecx		# ecx = cnt (16-byte groups remaining)
	pxor	%xmm0,%xmm0		# four 32-bit partial sums = 0

# SSE2 dot product loop unrolled 4 times, crunching 32 terms per loop
	.p2align 4
.Loop1:	subl	$4,%ecx			# need at least 4 groups for one unrolled pass
	jl	.Loop1Done

	movdqa	(%esi),%xmm1		# 8 samples of a
	pmaddwd	(%edi),%xmm1		# multiply by b, add adjacent pairs -> 4 dwords
	paddd	%xmm1,%xmm0

	movdqa	16(%esi),%xmm1
	pmaddwd	16(%edi),%xmm1
	paddd	%xmm1,%xmm0

	movdqa	32(%esi),%xmm1
	pmaddwd	32(%edi),%xmm1
	paddd	%xmm1,%xmm0

	movdqa	48(%esi),%xmm1
	addl	$64,%esi		# a += 32 samples
	pmaddwd	48(%edi),%xmm1
	addl	$64,%edi		# b += 32 samples
	paddd	%xmm1,%xmm0

	jmp	.Loop1
.Loop1Done:

	addl	$4,%ecx			# undo the last subtract: ecx = leftover groups (0..3)

# SSE2 dot product loop, not unrolled, crunching 8 terms per loop
# This could be redone as Duff's Device on the unrolled loop above
.Loop2:	subl	$1,%ecx
	jl	.Loop2Done

	movdqa	(%esi),%xmm1
	addl	$16,%esi		# a += 8 samples
	pmaddwd	(%edi),%xmm1
	addl	$16,%edi		# b += 8 samples
	paddd	%xmm1,%xmm0
	jmp	.Loop2
.Loop2Done:

# Horizontal sum of the four 32-bit lanes of xmm0, done entirely in XMM
# registers so that no callee-saved integer register is needed.
	movdqa	%xmm0,%xmm1
	psrldq	$8,%xmm1		# xmm1 lanes 0,1 = old lanes 2,3
	paddd	%xmm1,%xmm0		# lane0 = s0+s2, lane1 = s1+s3
	movdqa	%xmm0,%xmm1
	psrldq	$4,%xmm1		# xmm1 lane 0 = s1+s3
	paddd	%xmm1,%xmm0		# lane0 = s0+s1+s2+s3
	movd	%xmm0,%eax		# return value

	popl	%ecx
	popl	%edi
	popl	%esi
	movl	%ebp,%esp
	popl	%ebp
	ret
	.size	dotprod_sse2_assist,.-dotprod_sse2_assist

# Tell the linker this object does not need an executable stack.
# pushsection/popsection leaves the current section unchanged afterwards.
	.pushsection .note.GNU-stack,"",@progbits
	.popsection