; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX512
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=FAST-ymm-zmm
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BTVER2
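;
; The VZ prefix covers the AVX and AVX512F runs, where the VZeroUpper pass
; inserts vzeroupper to avoid AVX-SSE transition penalties. The NO-VZ prefix
; covers runs where the pass is expected to stand down: targets with
; +fast-partial-ymm-or-zmm-write, and btver2, which should imply that feature.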

declare i32 @foo()
declare <4 x float> @do_sse(<4 x float>)
declare <8 x float> @do_avx(<8 x float>)
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
@x = common global <4 x float> zeroinitializer, align 16
@g = common global <8 x float> zeroinitializer, align 32

;; Basic check - don't emit any vzeroupper instruction.
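;; test00 never writes a ymm register, so there is no dirty upper state to
;; clear around the call.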

define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind {
; ALL-LABEL: test00:
; ALL:       # %bb.0:
; ALL-NEXT:    pushq %rax
; ALL-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; ALL-NEXT:    callq do_sse
; ALL-NEXT:    popq %rax
; ALL-NEXT:    retq
  %add.i = fadd <4 x float> %a, %b
  %call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind
  ret <4 x float> %call3
}

;; Check 256-bit parameter passing.
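;; The 256-bit argument %c is live across both calls, so it is spilled and
;; reloaded around them; on VZ targets a single vzeroupper before the first
;; call is enough.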

define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind {
; VZ-LABEL: test01:
; VZ:       # %bb.0:
; VZ-NEXT:    subq $56, %rsp
; VZ-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
; VZ-NEXT:    vmovaps {{.*}}(%rip), %xmm0
; VZ-NEXT:    vzeroupper
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; VZ-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; VZ-NEXT:    addq $56, %rsp
; VZ-NEXT:    retq
;
; FAST-ymm-zmm-LABEL: test01:
; FAST-ymm-zmm:       # %bb.0:
; FAST-ymm-zmm-NEXT:    subq $56, %rsp
; FAST-ymm-zmm-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
; FAST-ymm-zmm-NEXT:    vmovaps {{.*}}(%rip), %xmm0
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; FAST-ymm-zmm-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; FAST-ymm-zmm-NEXT:    addq $56, %rsp
; FAST-ymm-zmm-NEXT:    retq
;
; BTVER2-LABEL: test01:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    subq $56, %rsp
; BTVER2-NEXT:    vmovaps {{.*}}(%rip), %xmm0
; BTVER2-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; BTVER2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; BTVER2-NEXT:    addq $56, %rsp
; BTVER2-NEXT:    retq
  %tmp = load <4 x float>, <4 x float>* @x, align 16
  %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
  store <4 x float> %call, <4 x float>* @x, align 16
  %call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind
  store <4 x float> %call2, <4 x float>* @x, align 16
  ret <8 x float> %c
}

;; Check that vzeroupper is emitted for tail calls.
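;; Since a tail call never returns here, the vzeroupper has to be placed
;; before the jmp itself.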

define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind {
; VZ-LABEL: test02:
; VZ:       # %bb.0:
; VZ-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; VZ-NEXT:    vzeroupper
; VZ-NEXT:    jmp do_sse # TAILCALL
;
; NO-VZ-LABEL: test02:
; NO-VZ:       # %bb.0:
; NO-VZ-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; NO-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; NO-VZ-NEXT:    jmp do_sse # TAILCALL
  %add.i = fadd <8 x float> %a, %b
  %add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0)
  %call3 = tail call <4 x float> @do_sse(<4 x float> %add.low) nounwind
  ret <4 x float> %call3
}

;; Test that the pass converges and that vzeroupper is only issued when
;; necessary; for this function it should be emitted exactly once.
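;; Only the call that follows the 256-bit load from @g sees dirty upper ymm
;; state; the preceding calls in the loop operate purely on xmm values.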

define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
; VZ-LABEL: test03:
; VZ:       # %bb.0: # %entry
; VZ-NEXT:    pushq %rbx
; VZ-NEXT:    subq $16, %rsp
; VZ-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; VZ-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; VZ-NEXT:    .p2align 4, 0x90
; VZ-NEXT:  .LBB3_1: # %while.cond
; VZ-NEXT:    # =>This Inner Loop Header: Depth=1
; VZ-NEXT:    callq foo
; VZ-NEXT:    testl %eax, %eax
; VZ-NEXT:    jne .LBB3_1
; VZ-NEXT:  # %bb.2: # %for.body.preheader
; VZ-NEXT:    movl $4, %ebx
; VZ-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; VZ-NEXT:    .p2align 4, 0x90
; VZ-NEXT:  .LBB3_3: # %for.body
; VZ-NEXT:    # =>This Inner Loop Header: Depth=1
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    vmovaps {{.*}}(%rip), %ymm0
; VZ-NEXT:    vextractf128 $1, %ymm0, %xmm0
; VZ-NEXT:    vzeroupper
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    decl %ebx
; VZ-NEXT:    jne .LBB3_3
; VZ-NEXT:  # %bb.4: # %for.end
; VZ-NEXT:    addq $16, %rsp
; VZ-NEXT:    popq %rbx
; VZ-NEXT:    retq
;
; FAST-ymm-zmm-LABEL: test03:
; FAST-ymm-zmm:       # %bb.0: # %entry
; FAST-ymm-zmm-NEXT:    pushq %rbx
; FAST-ymm-zmm-NEXT:    subq $16, %rsp
; FAST-ymm-zmm-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; FAST-ymm-zmm-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; FAST-ymm-zmm-NEXT:    .p2align 4, 0x90
; FAST-ymm-zmm-NEXT:  .LBB3_1: # %while.cond
; FAST-ymm-zmm-NEXT:    # =>This Inner Loop Header: Depth=1
; FAST-ymm-zmm-NEXT:    callq foo
; FAST-ymm-zmm-NEXT:    testl %eax, %eax
; FAST-ymm-zmm-NEXT:    jne .LBB3_1
; FAST-ymm-zmm-NEXT:  # %bb.2: # %for.body.preheader
; FAST-ymm-zmm-NEXT:    movl $4, %ebx
; FAST-ymm-zmm-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; FAST-ymm-zmm-NEXT:    .p2align 4, 0x90
; FAST-ymm-zmm-NEXT:  .LBB3_3: # %for.body
; FAST-ymm-zmm-NEXT:    # =>This Inner Loop Header: Depth=1
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    vmovaps {{.*}}(%rip), %ymm0
; FAST-ymm-zmm-NEXT:    vextractf128 $1, %ymm0, %xmm0
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    decl %ebx
; FAST-ymm-zmm-NEXT:    jne .LBB3_3
; FAST-ymm-zmm-NEXT:  # %bb.4: # %for.end
; FAST-ymm-zmm-NEXT:    addq $16, %rsp
; FAST-ymm-zmm-NEXT:    popq %rbx
; FAST-ymm-zmm-NEXT:    retq
;
; BTVER2-LABEL: test03:
; BTVER2:       # %bb.0: # %entry
; BTVER2-NEXT:    pushq %rbx
; BTVER2-NEXT:    subq $16, %rsp
; BTVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; BTVER2-NEXT:    .p2align 4, 0x90
; BTVER2-NEXT:  .LBB3_1: # %while.cond
; BTVER2-NEXT:    # =>This Inner Loop Header: Depth=1
; BTVER2-NEXT:    callq foo
; BTVER2-NEXT:    testl %eax, %eax
; BTVER2-NEXT:    jne .LBB3_1
; BTVER2-NEXT:  # %bb.2: # %for.body.preheader
; BTVER2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; BTVER2-NEXT:    movl $4, %ebx
; BTVER2-NEXT:    .p2align 4, 0x90
; BTVER2-NEXT:  .LBB3_3: # %for.body
; BTVER2-NEXT:    # =>This Inner Loop Header: Depth=1
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    vmovaps {{.*}}(%rip), %ymm0
; BTVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    decl %ebx
; BTVER2-NEXT:    jne .LBB3_3
; BTVER2-NEXT:  # %bb.4: # %for.end
; BTVER2-NEXT:    addq $16, %rsp
; BTVER2-NEXT:    popq %rbx
; BTVER2-NEXT:    retq
entry:
  %add.i = fadd <4 x float> %a, %b
  br label %while.cond

while.cond:
  %call = tail call i32 @foo()
  %tobool = icmp eq i32 %call, 0
  br i1 %tobool, label %for.body, label %while.cond

for.body:
  %i.018 = phi i32 [ 0, %while.cond ], [ %1, %for.body ]
  %c.017 = phi <4 x float> [ %add.i, %while.cond ], [ %call14, %for.body ]
  %call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind
  %call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind
  %tmp11 = load <8 x float>, <8 x float>* @g, align 32
  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind
  %call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind
  %1 = add nsw i32 %i.018, 1
  %exitcond = icmp eq i32 %1, 4
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret <4 x float> %call14
}

;; Check that vzeroupper is also emitted before returning from a function.
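;; The call to do_avx leaves the upper bits of ymm0 dirty, so on VZ targets
;; they must be cleared before retq.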

define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind {
; VZ-LABEL: test04:
; VZ:       # %bb.0:
; VZ-NEXT:    pushq %rax
; VZ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; VZ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; VZ-NEXT:    callq do_avx
; VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; VZ-NEXT:    popq %rax
; VZ-NEXT:    vzeroupper
; VZ-NEXT:    retq
;
; NO-VZ-LABEL: test04:
; NO-VZ:       # %bb.0:
; NO-VZ-NEXT:    pushq %rax
; NO-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; NO-VZ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NO-VZ-NEXT:    callq do_avx
; NO-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; NO-VZ-NEXT:    popq %rax
; NO-VZ-NEXT:    retq
  %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind
  %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf2
}