Home | History | Annotate | Download | only in i386
      1 // This file is dual licensed under the MIT and the University of Illinois Open
      2 // Source Licenses. See LICENSE.TXT for details.
      3 
      4 #include "../assembly.h"
      5 
      6 // float __floatundisf(du_int a);
      7 
      8 // Note that there is a hardware instruction, fildll, that does most of what
      9 // this function needs to do.  However, because of our ia32 ABI, it will take
     10 // a write-small read-large stall, so the software implementation here is
     11 // actually several cycles faster.
     12 
     13 // This is a branch-free implementation.  A branchy implementation might be
     14 // faster for the common case if you know something a priori about the input
     15 // distribution.
     16 
     17 /* branch-free x87 implementation - one cycle slower than without x87.
     18 
     19 #ifdef __i386__
     20 
     21 CONST_SECTION
     22 .balign 8
     23 
     24 		.quad	0x43f0000000000000
     25 twop64:	.quad	0x0000000000000000
     26 
     27 #define			TWOp64			twop64-0b(%ecx,%eax,8)
     28 
     29 .text
     30 .balign 4
     31 DEFINE_COMPILERRT_FUNCTION(__floatundisf)
     32 	movl		8(%esp),		%eax
     33 	movd		8(%esp),		%xmm1
     34 	movd		4(%esp),		%xmm0
     35 	punpckldq	%xmm1,			%xmm0
     36 	calll		0f
     37 0:	popl		%ecx
     38 	sarl		$31,			%eax
     39 	movq		%xmm0,			4(%esp)
     40 	fildll		4(%esp)
     41 	faddl		TWOp64
     42 	fstps		4(%esp)
     43 	flds		4(%esp)
     44 	ret
     45 END_COMPILERRT_FUNCTION(__floatundisf)
     46 
     47 #endif // __i386__
     48 
     49 */
     50 
     51 /* branch-free, x87-free implementation - faster at the expense of code size */
     52 
     53 #ifdef __i386__
     54 
     55 CONST_SECTION
     56 
     57 	.balign 16
     58 twop52:
     59 	.quad 0x4330000000000000
     60 	.quad 0x0000000000000fff
     61 
     62 	.balign 16
     63 sticky:
     64 	.quad 0x0000000000000000
     65 	.long 0x00000012
     66 
     67 	.balign 16
     68 twelve:
     69 	.long 0x00000000
     70 
     71 #define			TWOp52			twop52-0b(%ecx)
     72 #define			STICKY			sticky-0b(%ecx,%eax,8)
     73 
     74 .text
     75 .balign 4
     76 DEFINE_COMPILERRT_FUNCTION(__floatundisf)
     77 	movl		8(%esp),		%eax
     78 	movd		8(%esp),		%xmm1
     79 	movd		4(%esp),		%xmm0
     80 	punpckldq	%xmm1,			%xmm0
     81 
     82 	calll		0f
     83 0:	popl		%ecx
     84 	shrl		%eax					// high 31 bits of input as sint32
     85 	addl		$0x7ff80000,	%eax
     86 	sarl		$31,			%eax	// (big input) ? -1 : 0
     87 	movsd		STICKY,			%xmm1	// (big input) ? 0xfff : 0
     88 	movl		$12,			%edx
     89 	andl		%eax,			%edx	// (big input) ? 12 : 0
     90 	movd		%edx,			%xmm3
     91 	andpd		%xmm0,			%xmm1	// (big input) ? input & 0xfff : 0
     92 	movsd		TWOp52,			%xmm2	// 0x1.0p52
     93 	psrlq		%xmm3,			%xmm0	// (big input) ? input >> 12 : input
     94 	orpd		%xmm2,			%xmm1	// 0x1.0p52 + ((big input) ? input & 0xfff : input)
     95 	orpd		%xmm1,			%xmm0	// 0x1.0p52 + ((big input) ? (input >> 12 | input & 0xfff) : input)
     96 	subsd		%xmm2,			%xmm0	// (double)((big input) ? (input >> 12 | input & 0xfff) : input)
     97 	cvtsd2ss	%xmm0,			%xmm0	// (float)((big input) ? (input >> 12 | input & 0xfff) : input)
     98 	pslld		$23,			%xmm3
     99 	paddd		%xmm3,			%xmm0	// (float)input
    100 	movd		%xmm0,			4(%esp)
    101 	flds		4(%esp)
    102 	ret
    103 END_COMPILERRT_FUNCTION(__floatundisf)
    104 
    105 #endif // __i386__
    106