; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s

;; Codegen test for vzeroupper placement around calls and returns.
;; do_sse takes and returns a 128-bit <4 x float>; do_avx takes and returns a
;; 256-bit <8 x float>. Each function below pins where a vzeroupper must, or
;; must not, appear in the generated assembly.

declare <4 x float> @do_sse(<4 x float>)
declare <8 x float> @do_avx(<8 x float>)
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
@x = common global <4 x float> zeroinitializer, align 16
@g = common global <8 x float> zeroinitializer, align 32

;; Basic checking - don't emit any vzeroupper instruction
;; Only 128-bit values are used here, so nothing up to the return may contain
;; a vzeroupper.

; CHECK: _test00
define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
entry:
  ; CHECK-NOT: vzeroupper
  %add.i = fadd <4 x float> %a, %b
  %call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind
  ; CHECK: ret
  ret <4 x float> %call3
}

;; Check 256-bit parameter passing
;; The function receives a 256-bit argument (%c) and returns it unchanged.
;; A vzeroupper is expected immediately before the first do_sse call and none
;; between that call and the second one.

; CHECK: _test01
define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind uwtable ssp {
entry:
  %tmp = load <4 x float>* @x, align 16
  ; CHECK: vzeroupper
  ; CHECK-NEXT: callq _do_sse
  %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
  store <4 x float> %call, <4 x float>* @x, align 16
  ; CHECK-NOT: vzeroupper
  ; CHECK: callq _do_sse
  %call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind
  store <4 x float> %call2, <4 x float>* @x, align 16
  ; CHECK: ret
  ret <8 x float> %c
}

;; Test the pass convergence and also that vzeroupper is only issued when necessary,
;; for this function it should be only once
;; The loop body makes two back-to-back do_sse calls with no vzeroupper before
;; them, then loads the 256-bit global @g, extracts a 128-bit half, and makes a
;; third do_sse call that must be directly preceded by a vzeroupper.

; CHECK: _test02
define <4 x float> @test02(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
entry:
  %add.i = fadd <4 x float> %a, %b
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  ; CHECK: LBB
  ; CHECK-NOT: vzeroupper
  %i.018 = phi i32 [ 0, %entry ], [ %1, %for.body ]
  %c.017 = phi <4 x float> [ %add.i, %entry ], [ %call14, %for.body ]
  ; CHECK: callq _do_sse
  %call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind
  ; CHECK-NEXT: callq _do_sse
  %call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind
  %tmp11 = load <8 x float>* @g, align 32
  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind
  ; CHECK: vzeroupper
  ; CHECK-NEXT: callq _do_sse
  %call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind
  %1 = add nsw i32 %i.018, 1
  %exitcond = icmp eq i32 %1, 4
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret <4 x float> %call14
}

;; Check that we also perform vzeroupper when we return from a function.
;; The two 128-bit arguments are concatenated into a 256-bit vector and passed
;; to do_avx with no vzeroupper before the call; the low 128 bits of the result
;; are returned, and a vzeroupper is expected before the ret.

; CHECK: _test03
define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
entry:
  %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; CHECK-NOT: vzeroupper
  ; CHECK: call
  %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind
  %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; CHECK: vzeroupper
  ; CHECK: ret
  ret <4 x float> %shuf2
}