Home | History | Annotate | Download | only in i386
      1 // This file is dual licensed under the MIT and the University of Illinois Open
      2 // Source Licenses. See LICENSE.TXT for details.
      3 
      4 #include "../assembly.h"
      5 
      6 // float __floatundisf(du_int a);
      7 
      8 // Note that there is a hardware instruction, fildll, that does most of what
      9 // this function needs to do.  However, because of our ia32 ABI, it will take
     10 // a write-small read-large stall, so the software implementation here is
     11 // actually several cycles faster.
     12 
     13 // This is a branch-free implementation.  A branchy implementation might be
     14 // faster for the common case if you know something a priori about the input
     15 // distribution.
     16 
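/* Disabled alternative: a branch-free x87 implementation, one cycle slower
   than the x87-free implementation that follows it.  This comment stays open
   until the closing marker after the #endif, so none of it is assembled.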
     18 
     19 #ifdef __i386__
     20 
     21 .const
     22 .align 3
     23 
     24 		.quad	0x43f0000000000000
     25 twop64:	.quad	0x0000000000000000
     26 
     27 #define			TWOp64			twop64-0b(%ecx,%eax,8)
     28 
     29 .text
     30 .align 4
     31 DEFINE_COMPILERRT_FUNCTION(__floatundisf)
     32 	movl		8(%esp),		%eax
     33 	movd		8(%esp),		%xmm1
     34 	movd		4(%esp),		%xmm0
     35 	punpckldq	%xmm1,			%xmm0
     36 	calll		0f
     37 0:	popl		%ecx
     38 	sarl		$31,			%eax
     39 	movq		%xmm0,			4(%esp)
     40 	fildll		4(%esp)
     41 	faddl		TWOp64
     42 	fstps		4(%esp)
     43 	flds		4(%esp)
     44 	ret
     45 
     46 #endif // __i386__
     47 
     48 */
     49 
     50 /* branch-free, x87-free implementation - faster at the expense of code size */
     51 
     52 #ifdef __i386__
     53 
     54 #ifndef __ELF__
     55 .const
     56 .align 3
     57 #else
     58 .align 8
     59 #endif
     60 twop52: .quad 0x4330000000000000
     61 		.quad 0x0000000000000fff
     62 sticky: .quad 0x0000000000000000
     63 		.long 0x00000012
     64 twelve:	.long 0x00000000
     65 
     66 #define			TWOp52			twop52-0b(%ecx)
     67 #define			STICKY			sticky-0b(%ecx,%eax,8)
     68 
     69 .text
     70 .align 4
     71 DEFINE_COMPILERRT_FUNCTION(__floatundisf)
     72 	movl		8(%esp),		%eax
     73 	movd		8(%esp),		%xmm1
     74 	movd		4(%esp),		%xmm0
     75 	punpckldq	%xmm1,			%xmm0
     76 
     77 	calll		0f
     78 0:	popl		%ecx
     79 	shrl		%eax					// high 31 bits of input as sint32
     80 	addl		$0x7ff80000,	%eax
     81 	sarl		$31,			%eax	// (big input) ? -1 : 0
     82 	movsd		STICKY,			%xmm1	// (big input) ? 0xfff : 0
     83 	movl		$12,			%edx
     84 	andl		%eax,			%edx	// (big input) ? 12 : 0
     85 	movd		%edx,			%xmm3
     86 	andpd		%xmm0,			%xmm1	// (big input) ? input & 0xfff : 0
     87 	movsd		TWOp52,			%xmm2	// 0x1.0p52
     88 	psrlq		%xmm3,			%xmm0	// (big input) ? input >> 12 : input
     89 	orpd		%xmm2,			%xmm1	// 0x1.0p52 + ((big input) ? input & 0xfff : input)
     90 	orpd		%xmm1,			%xmm0	// 0x1.0p52 + ((big input) ? (input >> 12 | input & 0xfff) : input)
     91 	subsd		%xmm2,			%xmm0	// (double)((big input) ? (input >> 12 | input & 0xfff) : input)
     92 	cvtsd2ss	%xmm0,			%xmm0	// (float)((big input) ? (input >> 12 | input & 0xfff) : input)
     93 	pslld		$23,			%xmm3
     94 	paddd		%xmm3,			%xmm0	// (float)input
     95 	movd		%xmm0,			4(%esp)
     96 	flds		4(%esp)
     97 	ret
     98 
     99 #endif // __i386__
    100