# File from the fec/ directory (code-browser export header removed)
# SSE2 assist routines for sumsq
# Copyright 2001 Phil Karn, KA9Q
# May be used under the terms of the GNU Public License (GPL)

      5 	.text
      6 # Evaluate sum of squares of signed 16-bit input samples
      7 #  long long sumsq_sse2_assist(signed short *in,int cnt);
      8 	.global sumsq_sse2_assist
      9 	.type sumsq_sse2_assist,@function
     10 	.align 16
     11 sumsq_sse2_assist:
     12 	pushl %ebp
     13 	movl %esp,%ebp
     14 	pushl %esi
     15 	pushl %ecx
     16 
     17 	movl 8(%ebp),%esi
     18 	movl 12(%ebp),%ecx
     19 	pxor %xmm2,%xmm2		# zero sum
     20 	movaps low,%xmm3		# load mask
     21 
     22 1:	subl $8,%ecx
     23 	jl 2f
     24 	movaps (%esi),%xmm0	# S0 S1 S2 S3 S4 S5 S6 S7
     25 	pmaddwd %xmm0,%xmm0	# (S0*S0+S1*S1) (S2*S2+S3*S3) (S4*S4+S5*S5) (S6*S6+S7*S7)
     26 	movaps %xmm0,%xmm1
     27 	pand %xmm3,%xmm1	# (S0*S0+S1*S1) 0 (S4*S4+S5*S5) 0
     28 	paddq %xmm1,%xmm2	# sum even-numbered dwords
     29 	psrlq $32,%xmm0		# (S2*S2+S3*S3) 0 (S6*S6+S7*S7) 0
     30 	paddq %xmm0,%xmm2	# sum odd-numbered dwords
     31 	addl $16,%esi
     32 	jmp 1b
     33 
     34 2:	movaps %xmm2,%xmm0
     35 	psrldq $8,%xmm0
     36 	paddq %xmm2,%xmm0	# combine 64-bit sums
     37 
     38 	movd %xmm0,%eax		# low 32 bits of sum
     39 	psrldq $4,%xmm0
     40 	movd %xmm0,%edx		# high 32 bits of sum
     41 
     42 	popl %ecx
     43 	popl %esi
     44 	popl %ebp
     45 	ret
     46 
     47 	.data
     48 	.align 16
     49 low:	.byte 255,255,255,255,0,0,0,0,255,255,255,255,0,0,0,0
     50