//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: MMX-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

This:

#include <mmintrin.h>

__v2si qux(int A) {
  return (__v2si){ 0, A };
}

is compiled into:

_qux:
        subl    $28, %esp
        movl    32(%esp), %eax
        movd    %eax, %mm0
        movq    %mm0, (%esp)
        movl    (%esp), %eax
        movl    %eax, 20(%esp)
        movq    %mm0, 8(%esp)
        movl    12(%esp), %eax
        movl    %eax, 16(%esp)
        movq    16(%esp), %mm0
        addl    $28, %esp
        ret

Yuck!

GCC gives us:

_qux:
        subl    $12, %esp
        movl    16(%esp), %eax
        movl    20(%esp), %edx
        movl    $0, (%eax)
        movl    %edx, 4(%eax)
        addl    $12, %esp
        ret     $4

//===---------------------------------------------------------------------===//

We generate crappy code for this:

__m64 t() {
  return _mm_cvtsi32_si64(1);
}

_t:
        subl    $12, %esp
        movl    $1, %eax
        movd    %eax, %mm0
        movq    %mm0, (%esp)
        movl    (%esp), %eax
        movl    4(%esp), %edx
        addl    $12, %esp
        ret

The extra stack traffic is covered in the previous entry. But the other reason
is that we are not smart about materializing constants in MMX registers. With
-m64 we get:

        movl    $1, %eax
        movd    %eax, %mm0
        movd    %mm0, %rax
        ret

We should be using a constant-pool load instead:

        movq    LC0(%rip), %rax
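A minimal sketch of what the full constant-pool version could look like,
assuming a Darwin-style 8-byte literal section (the LC0 label and the section
directive are illustrative):

        .section __TEXT,__literal8,8byte_literals
LC0:
        .quad   1                       ## the whole i64 constant 0x0000000000000001

        .text
_t:
        movq    LC0(%rip), %rax         ## one load replaces the movl/movd/movd sequence
        ret

//===---------------------------------------------------------------------===//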