; RUN: llc < %s -mtriple=x86_64-linux -mattr=+64bit,+sse3 -print-failed-fuse-candidates -regalloc=basic 2>&1 | FileCheck %s
; CHECK: fail
; CHECK-NOT: fail

; Every function below passes %f to an external callee and then uses %f again
; as an operand of a floating-point or SSE intrinsic call, so %f stays live
; across the call. llc is run with -print-failed-fuse-candidates and the basic
; register allocator; stderr is merged into stdout with the portable "2>&1 |"
; form (rather than the bash-only pipe-both operator) so the diagnostic output
; reaches FileCheck. The two directives above require one "fail" in that
; output and reject any further occurrence after it.

declare float @test_f(float %f)
declare double @test_d(double %f)
declare <4 x float> @test_vf(<4 x float> %f)
declare <2 x double> @test_vd(<2 x double> %f)
declare float @llvm.sqrt.f32(float)
declare double @llvm.sqrt.f64(double)

declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>)
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>)
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>)
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8)
declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>)
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>)
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8)
declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>)

; Scalar square roots: %f is the only operand of the intrinsic.
define float @foo(float %f) {
  %a = call float @test_f(float %f)
  %t = call float @llvm.sqrt.f32(float %f)
  ret float %t
}
define double @doo(double %f) {
  %a = call double @test_d(double %f)
  %t = call double @llvm.sqrt.f64(double %f)
  ret double %t
}

; Unary <4 x float> intrinsics: %f is the only operand.
define <4 x float> @a0(<4 x float> %f) {
  %a = call <4 x float> @test_vf(<4 x float> %f)
  %t = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %f)
  ret <4 x float> %t
}
define <4 x float> @a1(<4 x float> %f) {
  %a = call <4 x float> @test_vf(<4 x float> %f)
  %t = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %f)
  ret <4 x float> %t
}
define <4 x float> @a2(<4 x float> %f) {
  %a = call <4 x float> @test_vf(<4 x float> %f)
  %t = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %f)
  ret <4 x float> %t
}

; Binary <4 x float> intrinsics: the call result %y is the first operand and
; %f is the second.
define <4 x float> @b3(<4 x float> %f) {
  %y = call <4 x float> @test_vf(<4 x float> %f)
  %t = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %y, <4 x float> %f)
  ret <4 x float> %t
}
define <4 x float> @b4(<4 x float> %f) {
  %y = call <4 x float> @test_vf(<4 x float> %f)
  %t = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %y, <4 x float> %f)
  ret <4 x float> %t
}
define <4 x float> @b5(<4 x float> %f) {
  %y = call <4 x float> @test_vf(<4 x float> %f)
  %t = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %y, <4 x float> %f, i8 7)
  ret <4 x float> %t
}
define <4 x float> @b6(<4 x float> %f) {
  %y = call <4 x float> @test_vf(<4 x float> %f)
  %t = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %y, <4 x float> %f)
  ret <4 x float> %t
}
define <4 x float> @b7(<4 x float> %f) {
  %y = call <4 x float> @test_vf(<4 x float> %f)
  %t = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %y, <4 x float> %f)
  ret <4 x float> %t
}
define <4 x float> @b8(<4 x float> %f) {
  %y = call <4 x float> @test_vf(<4 x float> %f)
  %t = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %y, <4 x float> %f)
  ret <4 x float> %t
}

; Unary <2 x double> intrinsic: %f is the only operand.
define <2 x double> @c1(<2 x double> %f) {
  %a = call <2 x double> @test_vd(<2 x double> %f)
  %t = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %f)
  ret <2 x double> %t
}

; Binary <2 x double> intrinsics: the call result %y is the first operand and
; %f is the second.
define <2 x double> @d3(<2 x double> %f) {
  %y = call <2 x double> @test_vd(<2 x double> %f)
  %t = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %y, <2 x double> %f)
  ret <2 x double> %t
}
define <2 x double> @d4(<2 x double> %f) {
  %y = call <2 x double> @test_vd(<2 x double> %f)
  %t = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %y, <2 x double> %f)
  ret <2 x double> %t
}
define <2 x double> @d5(<2 x double> %f) {
  %y = call <2 x double> @test_vd(<2 x double> %f)
  %t = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %y, <2 x double> %f, i8 7)
  ret <2 x double> %t
}
define <2 x double> @d6(<2 x double> %f) {
  %y = call <2 x double> @test_vd(<2 x double> %f)
  %t = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %y, <2 x double> %f)
  ret <2 x double> %t
}
define <2 x double> @d7(<2 x double> %f) {
  %y = call <2 x double> @test_vd(<2 x double> %f)
  %t = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %y, <2 x double> %f)
  ret <2 x double> %t
}
define <2 x double> @d8(<2 x double> %f) {
  %y = call <2 x double> @test_vd(<2 x double> %f)
  %t = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %y, <2 x double> %f)
  ret <2 x double> %t
}

; This one should fail to fuse: unlike @d8, %f is the first operand of hsub.pd
; and the call result %y is the second. -regalloc=greedy doesn't even try to
; fold it. Instead it produces:
;   callq  test_vd
;   movapd (%rsp), %xmm1           # 16-byte Reload
;   hsubpd %xmm0, %xmm1
;   movapd %xmm1, %xmm0
;   addq   $24, %rsp
;   ret
; RABasic still tries to fold this one.

define <2 x double> @z0(<2 x double> %f) {
  %y = call <2 x double> @test_vd(<2 x double> %f)
  %t = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %f, <2 x double> %y)
  ret <2 x double> %t
}