# SSE2 assist routines for sumsq
# Copyright 2001 Phil Karn, KA9Q
# May be used under the terms of the GNU Public License (GPL)
#
# Syntax: GNU as, AT&T operand order (op src,dst).
# Target: 32-bit x86 ELF, SSE2 required.

	.text

#-----------------------------------------------------------------------
# long long sumsq_sse2_assist(signed short *in,int cnt);
#
# Evaluate sum of squares of signed 16-bit input samples.
#
# ABI:    32-bit x86, both arguments passed on the stack.
# In:     8(%ebp)  = in   pointer to the samples; MUST be 16-byte
#                         aligned, because the samples are fetched with
#                         an aligned 128-bit load
#         12(%ebp) = cnt  number of samples available at 'in'
# Out:    %edx:%eax = 64-bit sum of squares
# Clobb:  %eax, %edx, %xmm0-%xmm3, flags
# Saved:  %ebp, %esi, %ecx (%ecx is preserved although the code itself
#         never needs the old value afterwards, so that existing callers
#         that rely on it keep working)
#
# Only complete groups of 8 samples are summed.  The trailing cnt % 8
# samples are NOT read, and the result is 0 when cnt < 8 (including
# zero and negative counts); handling of any remainder is left to the
# caller.
#
# The routine references no memory other than its arguments and the
# input buffer, so it contains no absolute addresses.
#-----------------------------------------------------------------------
	.global	sumsq_sse2_assist
	.type	sumsq_sse2_assist,@function
	.align	16
sumsq_sse2_assist:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%esi
	pushl	%ecx

	movl	8(%ebp),%esi		# esi = in
	movl	12(%ebp),%ecx		# ecx = cnt (samples not yet consumed)
	pxor	%xmm2,%xmm2		# xmm2 = two 64-bit partial sums, both 0

	# Build the "low dword of each qword" mask in-register instead of
	# loading it from a data table: all ones, then clear the upper half
	# of each 64-bit lane -> 0x00000000FFFFFFFF 0x00000000FFFFFFFF.
	pcmpeqd	%xmm3,%xmm3
	psrlq	$32,%xmm3

	subl	$8,%ecx			# is there at least one full group of 8?
	jl	2f			# no: return the zero sum

	# Invariant at 1: ecx >= 0, i.e. 8 unread samples remain at (%esi).
1:	movdqa	(%esi),%xmm0		# S0 S1 S2 S3 S4 S5 S6 S7
	pmaddwd	%xmm0,%xmm0		# (S0*S0+S1*S1) (S2*S2+S3*S3) (S4*S4+S5*S5) (S6*S6+S7*S7)
	# Each dword above is at most 2*32768*32768 = 2^31.  That does not
	# fit a signed dword, so the dwords are widened to 64 bits by ZERO
	# extension (mask / logical shift) before being accumulated.
	movdqa	%xmm0,%xmm1
	pand	%xmm3,%xmm1		# (S0*S0+S1*S1) 0 (S4*S4+S5*S5) 0
	paddq	%xmm1,%xmm2		# sum even-numbered dwords
	psrlq	$32,%xmm0		# (S2*S2+S3*S3) 0 (S6*S6+S7*S7) 0
	paddq	%xmm0,%xmm2		# sum odd-numbered dwords
	addl	$16,%esi		# advance past the 8 samples just used
	subl	$8,%ecx
	jge	1b			# another full group of 8 is available

2:	movdqa	%xmm2,%xmm0
	psrldq	$8,%xmm0		# bring the upper 64-bit partial sum down
	paddq	%xmm2,%xmm0		# combine 64-bit sums

	movd	%xmm0,%eax		# low 32 bits of sum
	psrldq	$4,%xmm0
	movd	%xmm0,%edx		# high 32 bits of sum

	popl	%ecx
	popl	%esi
	popl	%ebp
	ret
	.size	sumsq_sse2_assist, .-sumsq_sse2_assist

# Tell the linker this object does not need an executable stack.
	.section	.note.GNU-stack,"",@progbits