      1 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
      3 declare <4 x float> @do_sse(<4 x float>)
        ; @do_sse (above) is an external callee that takes and returns a 128-bit
        ; <4 x float>; @do_avx (below) is its 256-bit <8 x float> counterpart.
      4 declare <8 x float> @do_avx(<8 x float>)
        ; AVX intrinsic used by test02 to obtain a <4 x float> from an <8 x float>.
      5 declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
        ; @x: 16-byte-aligned <4 x float> global, loaded and stored by test01.
      6 @x = common global <4 x float> zeroinitializer, align 16
        ; @g: 32-byte-aligned <8 x float> global, loaded by test02.
      7 @g = common global <8 x float> zeroinitializer, align 32
      9 ;; Basic checking - don't emit any vzeroupper instruction
     11 ; CHECK: _test00
     12 define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
     13 entry:
          ; Every value in this function is a 128-bit <4 x float>: %a and %b are
          ; added, the sum is passed to @do_sse, and the call result is returned.
          ; The negative check below requires that no vzeroupper is matched
          ; before the 'ret' match.
     14   ; CHECK-NOT: vzeroupper
     15   %add.i = fadd <4 x float> %a, %b
     16   %call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind
     17   ; CHECK: ret
     18   ret <4 x float> %call3
     19 }
     21 ;; Check 256-bit parameter passing
     23 ; CHECK: _test01
     24 define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind uwtable ssp {
     25 entry:
          ; %c is a 256-bit <8 x float> argument that is returned unchanged; %a and
          ; %b are not used. The body makes two chained 128-bit calls to @do_sse,
          ; reading the first input from @x and storing each result back to @x.
     26   %tmp = load <4 x float>* @x, align 16
          ; A vzeroupper is expected on the line immediately before the first call
          ; (presumably because the 256-bit argument %c is live on entry -- confirm
          ; against the vzeroupper insertion pass).
     27   ; CHECK: vzeroupper
     28   ; CHECK-NEXT: callq _do_sse
     29   %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
     30   store <4 x float> %call, <4 x float>* @x, align 16
          ; No further vzeroupper may be matched between the first and second call.
     31   ; CHECK-NOT: vzeroupper
     32   ; CHECK: callq _do_sse
     33   %call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind
     34   store <4 x float> %call2, <4 x float>* @x, align 16
     35   ; CHECK: ret
     36   ret <8 x float> %c
     37 }
     39 ;; Test the pass convergence and also that vzeroupper is only issued when necessary;
     40 ;; for this function it should be issued only once.
     42 ; CHECK: _test02
     43 define <4 x float> @test02(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
     44 entry:
     45   %add.i = fadd <4 x float> %a, %b
     46   br label %for.body
     48 for.body:                                         ; preds = %for.body, %entry
          ; Loop body. %i.018 counts iterations starting at 0 and the loop exits once
          ; the incremented counter equals 4. %c.017 carries the result of the
          ; previous iteration's last call (initially %add.i).
          ; After the loop label, no vzeroupper may be matched before the first call.
     49   ; CHECK: LBB
     50   ; CHECK-NOT: vzeroupper
     51   %i.018 = phi i32 [ 0, %entry ], [ %1, %for.body ]
     52   %c.017 = phi <4 x float> [ %add.i, %entry ], [ %call14, %for.body ]
     53   ; CHECK: callq _do_sse
     54   %call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind
          ; The second call must appear on the line directly after the first call.
     55   ; CHECK-NEXT: callq _do_sse
     56   %call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind
          ; Load the 256-bit global @g and take a <4 x float> piece of it through the
          ; vextractf128 intrinsic (immediate 1); this is the only 256-bit value in
          ; the function.
     57   %tmp11 = load <8 x float>* @g, align 32
     58   %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind
          ; A vzeroupper is expected on the line immediately before the third call,
          ; which consumes the value extracted from the 256-bit load above.
     59   ; CHECK: vzeroupper
     60   ; CHECK-NEXT: callq _do_sse
     61   %call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind
     62   %1 = add nsw i32 %i.018, 1
     63   %exitcond = icmp eq i32 %1, 4
     64   br i1 %exitcond, label %for.end, label %for.body
     66 for.end:                                          ; preds = %for.body
     67   ret <4 x float> %call14
     68 }
     70 ;; Check that we also perform vzeroupper when we return from a function.
     72 ; CHECK: _test03
     73 define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
     74 entry:
          ; Concatenate %a and %b into a single 256-bit <8 x float> (mask 0..7 picks
          ; all four elements of %a followed by all four elements of %b).
     75   %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
          ; No vzeroupper may be matched before the call, which takes the 256-bit
          ; value as its argument.
     76   ; CHECK-NOT: vzeroupper
     77   ; CHECK: call
     78   %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind
          ; Keep only elements 0-3 of the 256-bit call result as the 128-bit return
          ; value.
     79   %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
          ; A vzeroupper is expected after the call and before the return of the
          ; 128-bit value.
     80   ; CHECK: vzeroupper
     81   ; CHECK: ret
     82   ret <4 x float> %shuf2
     83 }