; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX512
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=FAST-ymm-zmm
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BTVER2

declare i32 @foo()
declare <4 x float> @do_sse(<4 x float>)
declare <8 x float> @do_avx(<8 x float>)
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
@x = common global <4 x float> zeroinitializer, align 16
@g = common global <8 x float> zeroinitializer, align 32

;; Basic checking - don't emit any vzeroupper instruction

define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind {
; ALL-LABEL: test00:
; ALL:       # %bb.0:
; ALL-NEXT:    pushq %rax
; ALL-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; ALL-NEXT:    callq do_sse
; ALL-NEXT:    popq %rax
; ALL-NEXT:    retq
  %add.i = fadd <4 x float> %a, %b
  %call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind
  ret <4 x float> %call3
}

;; Check 256-bit parameter passing

define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind {
; VZ-LABEL: test01:
; VZ:       # %bb.0:
; VZ-NEXT:    subq $56, %rsp
; VZ-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
; VZ-NEXT:    vmovaps {{.*}}(%rip), %xmm0
; VZ-NEXT:    vzeroupper
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; VZ-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; VZ-NEXT:    addq $56, %rsp
; VZ-NEXT:    retq
;
; FAST-ymm-zmm-LABEL: test01:
; FAST-ymm-zmm:       # %bb.0:
; FAST-ymm-zmm-NEXT:    subq $56, %rsp
; FAST-ymm-zmm-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
; FAST-ymm-zmm-NEXT:    vmovaps {{.*}}(%rip), %xmm0
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; FAST-ymm-zmm-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; FAST-ymm-zmm-NEXT:    addq $56, %rsp
; FAST-ymm-zmm-NEXT:    retq
;
; BTVER2-LABEL: test01:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    subq $56, %rsp
; BTVER2-NEXT:    vmovaps {{.*}}(%rip), %xmm0
; BTVER2-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; BTVER2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; BTVER2-NEXT:    addq $56, %rsp
; BTVER2-NEXT:    retq
  %tmp = load <4 x float>, <4 x float>* @x, align 16
  %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
  store <4 x float> %call, <4 x float>* @x, align 16
  %call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind
  store <4 x float> %call2, <4 x float>* @x, align 16
  ret <8 x float> %c
}

;; Check that vzeroupper is emitted for tail calls.
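;; Since control never returns after the jump, the upper ymm state has to be
;; cleared before the branch itself to avoid AVX/SSE transition penalties in
;; the legacy-SSE callee.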

define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind {
; VZ-LABEL: test02:
; VZ:       # %bb.0:
; VZ-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; VZ-NEXT:    vzeroupper
; VZ-NEXT:    jmp do_sse # TAILCALL
;
; NO-VZ-LABEL: test02:
; NO-VZ:       # %bb.0:
; NO-VZ-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; NO-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; NO-VZ-NEXT:    jmp do_sse # TAILCALL
  %add.i = fadd <8 x float> %a, %b
  %add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0)
  %call3 = tail call <4 x float> @do_sse(<4 x float> %add.low) nounwind
  ret <4 x float> %call3
}

;; Test the pass convergence and also that vzeroupper is only issued when
;; necessary; for this function it should be issued only once.

define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
; VZ-LABEL: test03:
; VZ:       # %bb.0: # %entry
; VZ-NEXT:    pushq %rbx
; VZ-NEXT:    subq $16, %rsp
; VZ-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; VZ-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; VZ-NEXT:    .p2align 4, 0x90
; VZ-NEXT:  .LBB3_1: # %while.cond
; VZ-NEXT:    # =>This Inner Loop Header: Depth=1
; VZ-NEXT:    callq foo
; VZ-NEXT:    testl %eax, %eax
; VZ-NEXT:    jne .LBB3_1
; VZ-NEXT:  # %bb.2: # %for.body.preheader
; VZ-NEXT:    movl $4, %ebx
; VZ-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; VZ-NEXT:    .p2align 4, 0x90
; VZ-NEXT:  .LBB3_3: # %for.body
; VZ-NEXT:    # =>This Inner Loop Header: Depth=1
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    vmovaps {{.*}}(%rip), %ymm0
; VZ-NEXT:    vextractf128 $1, %ymm0, %xmm0
; VZ-NEXT:    vzeroupper
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    decl %ebx
; VZ-NEXT:    jne .LBB3_3
; VZ-NEXT:  # %bb.4: # %for.end
; VZ-NEXT:    addq $16, %rsp
; VZ-NEXT:    popq %rbx
; VZ-NEXT:    retq
;
; FAST-ymm-zmm-LABEL: test03:
; FAST-ymm-zmm:       # %bb.0: # %entry
; FAST-ymm-zmm-NEXT:    pushq %rbx
; FAST-ymm-zmm-NEXT:    subq $16, %rsp
; FAST-ymm-zmm-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; FAST-ymm-zmm-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; FAST-ymm-zmm-NEXT:    .p2align 4, 0x90
; FAST-ymm-zmm-NEXT:  .LBB3_1: # %while.cond
; FAST-ymm-zmm-NEXT:    # =>This Inner Loop Header: Depth=1
; FAST-ymm-zmm-NEXT:    callq foo
; FAST-ymm-zmm-NEXT:    testl %eax, %eax
; FAST-ymm-zmm-NEXT:    jne .LBB3_1
; FAST-ymm-zmm-NEXT:  # %bb.2: # %for.body.preheader
; FAST-ymm-zmm-NEXT:    movl $4, %ebx
; FAST-ymm-zmm-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; FAST-ymm-zmm-NEXT:    .p2align 4, 0x90
; FAST-ymm-zmm-NEXT:  .LBB3_3: # %for.body
; FAST-ymm-zmm-NEXT:    # =>This Inner Loop Header: Depth=1
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    vmovaps {{.*}}(%rip), %ymm0
; FAST-ymm-zmm-NEXT:    vextractf128 $1, %ymm0, %xmm0
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    decl %ebx
; FAST-ymm-zmm-NEXT:    jne .LBB3_3
; FAST-ymm-zmm-NEXT:  # %bb.4: # %for.end
; FAST-ymm-zmm-NEXT:    addq $16, %rsp
; FAST-ymm-zmm-NEXT:    popq %rbx
; FAST-ymm-zmm-NEXT:    retq
;
; BTVER2-LABEL: test03:
; BTVER2:       # %bb.0: # %entry
; BTVER2-NEXT:    pushq %rbx
; BTVER2-NEXT:    subq $16, %rsp
; BTVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; BTVER2-NEXT:    .p2align 4, 0x90
; BTVER2-NEXT:  .LBB3_1: # %while.cond
; BTVER2-NEXT:    # =>This Inner Loop Header: Depth=1
; BTVER2-NEXT:    callq foo
; BTVER2-NEXT:    testl %eax, %eax
; BTVER2-NEXT:    jne .LBB3_1
; BTVER2-NEXT:  # %bb.2: # %for.body.preheader
; BTVER2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; BTVER2-NEXT:    movl $4, %ebx
; BTVER2-NEXT:    .p2align 4, 0x90
; BTVER2-NEXT:  .LBB3_3: # %for.body
; BTVER2-NEXT:    # =>This Inner Loop Header: Depth=1
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    vmovaps {{.*}}(%rip), %ymm0
; BTVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    decl %ebx
; BTVER2-NEXT:    jne .LBB3_3
; BTVER2-NEXT:  # %bb.4: # %for.end
; BTVER2-NEXT:    addq $16, %rsp
; BTVER2-NEXT:    popq %rbx
; BTVER2-NEXT:    retq
entry:
  %add.i = fadd <4 x float> %a, %b
  br label %while.cond

while.cond:
  %call = tail call i32 @foo()
  %tobool = icmp eq i32 %call, 0
  br i1 %tobool, label %for.body, label %while.cond

for.body:
  %i.018 = phi i32 [ 0, %while.cond ], [ %1, %for.body ]
  %c.017 = phi <4 x float> [ %add.i, %while.cond ], [ %call14, %for.body ]
  %call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind
  %call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind
  %tmp11 = load <8 x float>, <8 x float>* @g, align 32
  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind
  %call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind
  %1 = add nsw i32 %i.018, 1
  %exitcond = icmp eq i32 %1, 4
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret <4 x float> %call14
}

;; Check that we also emit vzeroupper when we return from a function.

define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind {
; VZ-LABEL: test04:
; VZ:       # %bb.0:
; VZ-NEXT:    pushq %rax
; VZ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; VZ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; VZ-NEXT:    callq do_avx
; VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; VZ-NEXT:    popq %rax
; VZ-NEXT:    vzeroupper
; VZ-NEXT:    retq
;
; NO-VZ-LABEL: test04:
; NO-VZ:       # %bb.0:
; NO-VZ-NEXT:    pushq %rax
; NO-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; NO-VZ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NO-VZ-NEXT:    callq do_avx
; NO-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; NO-VZ-NEXT:    popq %rax
; NO-VZ-NEXT:    retq
  %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind
  %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf2
}