Home | History | Annotate | Download | only in AMDGPU
      1 ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SI
      2 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI
      3 
      4 ; Check that WQM (whole quad mode) isn't triggered by image load/store intrinsics.
      5 ;
      6 ;CHECK-LABEL: {{^}}test1:
      7 ;CHECK-NOT: s_wqm
      8 define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) {
        ; Pixel-shader entry point that uses only the image load and image store
        ; intrinsics: it reads a <4 x float> addressed by %c through %rsrc, writes
        ; the same value back with the same operands, and returns it.
        ; No image sample intrinsic is called in this function.
      9 main_body:
     10   %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
     11   call void @llvm.amdgcn.image.store.v4i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
     12   ret <4 x float> %tex
     13 }
     14 
     15 ; Check that WQM is triggered by image samples and left untouched for loads...
     16 ;
     17 ;CHECK-LABEL: {{^}}test2:
     18 ;CHECK-NEXT: ; %main_body
     19 ;CHECK-NEXT: s_wqm_b64 exec, exec
     20 ;CHECK: image_sample
     21 ;CHECK-NOT: exec
     22 ;CHECK: _load_dword v0,
     23 define amdgpu_ps float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
        ; Samples an image and then performs a global (addrspace 1) load whose
        ; address is derived from the sample result. The function contains no
        ; store; it returns the loaded float.
     24 main_body:
     25   %c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
          ; Reinterpret the sampled floats as integers and use element 0 as the
          ; element index into %ptr, so the load depends on the sample.
     26   %c.2 = bitcast <4 x float> %c.1 to <4 x i32>
     27   %c.3 = extractelement <4 x i32> %c.2, i32 0
     28   %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
     29   %data = load float, float addrspace(1)* %gep
     30   ret float %data
     31 }
     32 
     33 ; ... but disabled for stores (and, in this simple case, not re-enabled).
     34 ;
     35 ;CHECK-LABEL: {{^}}test3:
     36 ;CHECK-NEXT: ; %main_body
     37 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
     38 ;CHECK-NEXT: s_wqm_b64 exec, exec
     39 ;CHECK: image_sample
     40 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
     41 ;CHECK: store
     42 ;CHECK-NOT: exec
     43 ;CHECK: .size test3
     44 define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
        ; Samples an image, then stores one component of the sample to global
        ; memory at an address computed from another component. The full sample
        ; is returned. The store is the last memory operation in the function.
     45 main_body:
     46   %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
          ; Element 0 of the sample (viewed as i32) selects the store address.
     47   %tex.1 = bitcast <4 x float> %tex to <4 x i32>
     48   %tex.2 = extractelement <4 x i32> %tex.1, i32 0
     49   %gep = getelementptr float, float addrspace(1)* %ptr, i32 %tex.2
          ; Element 1 of the sample is the value written.
     50   %wr = extractelement <4 x float> %tex, i32 1
     51   store float %wr, float addrspace(1)* %gep
     52   ret <4 x float> %tex
     53 }
     54 
     55 ; Check that WQM is re-enabled when required.
     56 ;
     57 ;CHECK-LABEL: {{^}}test4:
     58 ;CHECK-NEXT: ; %main_body
     59 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
     60 ;CHECK-NEXT: s_wqm_b64 exec, exec
     61 ;CHECK: v_mul_lo_i32 [[MUL:v[0-9]+]], v0, v1
     62 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
     63 ;CHECK: store
     64 ;CHECK: s_wqm_b64 exec, exec
     65 ;CHECK: image_sample v[0:3], [[MUL]], s[0:7], s[8:11] dmask:0xf
     66 define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
        ; A store followed by an image sample, both consuming the same product
        ; %c * %d: first as the element index for the store of %data, then as the
        ; first operand of the sample. The sample result is returned.
     67 main_body:
          ; Shared by the store address and the sample below.
     68   %c.1 = mul i32 %c, %d
     69   %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1
     70   store float %data, float addrspace(1)* %gep
          ; The sample comes after the store in program order.
     71   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
     72   ret <4 x float> %tex
     73 }
     74 
     75 ; Check a case of one branch of an if-else requiring WQM, the other requiring
     76 ; exact.
     77 ;
     78 ; Note: In this particular case, the save-and-restore could be avoided if the
     79 ; analysis understood that the two branches of the if-else are mutually
     80 ; exclusive.
     81 ;
     82 ;CHECK-LABEL: {{^}}test_control_flow_0:
     83 ;CHECK-NEXT: ; %main_body
     84 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
     85 ;CHECK-NEXT: s_wqm_b64 exec, exec
     86 ;CHECK: %ELSE
     87 ;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
     88 ;CHECK: store
     89 ;CHECK: s_mov_b64 exec, [[SAVED]]
     90 ;CHECK: %IF
     91 ;CHECK: image_sample
     92 define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
        ; If-else on %z == 0: the IF side performs an image sample, the ELSE side
        ; performs a global store. The returned float is element 0 of the sample
        ; when IF was taken, otherwise the unmodified %data argument.
     93 main_body:
     94   %cmp = icmp eq i32 %z, 0
     95   br i1 %cmp, label %IF, label %ELSE
     96 
     97 IF:
          ; Sample-only branch (taken when %z == 0).
     98   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
     99   %data.if = extractelement <4 x float> %tex, i32 0
    100   br label %END
    101 
    102 ELSE:
          ; Store-only branch (taken when %z != 0).
    103   %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
    104   store float %data, float addrspace(1)* %gep
    105   br label %END
    106 
    107 END:
    108   %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
    109   ret float %r
    110 }
    111 
    112 ; Reverse branch order compared to the previous test.
    113 ;
    114 ;CHECK-LABEL: {{^}}test_control_flow_1:
    115 ;CHECK-NEXT: ; %main_body
    116 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
    117 ;CHECK-NEXT: s_wqm_b64 exec, exec
    118 ;CHECK: %IF
    119 ;CHECK: image_sample
    120 ;CHECK: %Flow
    121 ;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
    122 ;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
    123 ;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
    124 ;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
    125 ;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]
    126 ;CHECK-NEXT: ; BB#3: ; %ELSE
    127 ;CHECK: store_dword
    128 ;CHECK: [[END_BB]]: ; %END
    129 ;CHECK: s_or_b64 exec, exec,
    130 ;CHECK: v_mov_b32_e32 v0
    131 ;CHECK: ; return
    132 define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
        ; Same block contents as test_control_flow_0, but the successors of the
        ; conditional branch are swapped: %z == 0 now goes to ELSE (the store)
        ; and %z != 0 goes to IF (the sample).
    133 main_body:
    134   %cmp = icmp eq i32 %z, 0
          ; Note the operand order: true -> ELSE, false -> IF.
    135   br i1 %cmp, label %ELSE, label %IF
    136 
    137 IF:
          ; Sample-only branch.
    138   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
    139   %data.if = extractelement <4 x float> %tex, i32 0
    140   br label %END
    141 
    142 ELSE:
          ; Store-only branch.
    143   %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
    144   store float %data, float addrspace(1)* %gep
    145   br label %END
    146 
    147 END:
    148   %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
    149   ret float %r
    150 }
    151 
    152 ; Check that branch conditions are properly marked as needing WQM...
    153 ;
    154 ;CHECK-LABEL: {{^}}test_control_flow_2:
    155 ;CHECK-NEXT: ; %main_body
    156 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
    157 ;CHECK-NEXT: s_wqm_b64 exec, exec
    158 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
    159 ;CHECK: store
    160 ;CHECK: s_wqm_b64 exec, exec
    161 ;CHECK: load
    162 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
    163 ;CHECK: store
    164 ;CHECK: s_wqm_b64 exec, exec
    165 ;CHECK: v_cmp
    166 define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
        ; Store, load, store, then a branch on the loaded value. Each side of the
        ; branch computes a different multiple of %coord, and the merged value is
        ; the first operand of the image sample in END. The branch condition
        ; therefore influences what the sample consumes.
    167 main_body:
          ; First store: element 0 of %data to %ptr[%idx[0]].
    168   %idx.1 = extractelement <3 x i32> %idx, i32 0
    169   %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
    170   %data.1 = extractelement <2 x float> %data, i32 0
    171   store float %data.1, float addrspace(1)* %gep.1
    172 
    173   ; The load that determines the branch (and should therefore be WQM) is
    174   ; surrounded by stores that require disabled WQM.
    175   %idx.2 = extractelement <3 x i32> %idx, i32 1
    176   %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
    177   %z = load float, float addrspace(1)* %gep.2
    178 
          ; Second store: element 1 of %data to %ptr[%idx[2]].
    179   %idx.3 = extractelement <3 x i32> %idx, i32 2
    180   %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
    181   %data.3 = extractelement <2 x float> %data, i32 1
    182   store float %data.3, float addrspace(1)* %gep.3
    183 
          ; Branch on the loaded value.
    184   %cc = fcmp ogt float %z, 0.0
    185   br i1 %cc, label %IF, label %ELSE
    186 
    187 IF:
    188   %coord.IF = mul i32 %coord, 3
    189   br label %END
    190 
    191 ELSE:
    192   %coord.ELSE = mul i32 %coord, 4
    193   br label %END
    194 
    195 END:
          ; The sample operand depends on which side of the branch was taken.
    196   %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
    197   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord.END, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
    198   ret <4 x float> %tex
    199 }
    200 
    201 ; ... but only if they really do need it.
    202 ;
    203 ;CHECK-LABEL: {{^}}test_control_flow_3:
    204 ;CHECK-NEXT: ; %main_body
    205 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
    206 ;CHECK-NEXT: s_wqm_b64 exec, exec
    207 ;CHECK: image_sample
    208 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
    209 ;CHECK: store
    210 ;CHECK: load
    211 ;CHECK: store
    212 ;CHECK: v_cmp
    213 define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
        ; Counterpart of test_control_flow_2 with the sample moved to the start.
        ; The same store/load/store sequence and branch on the loaded value
        ; follow, but the branch only selects a scale factor for the already
        ; sampled value; no image sample comes after the branch.
    214 main_body:
          ; The only sample in the function; it uses the %coord argument directly.
    215   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
    216   %tex.1 = extractelement <4 x float> %tex, i32 0
    217 
          ; First store: element 0 of %data to %ptr[%idx[0]].
    218   %idx.1 = extractelement <3 x i32> %idx, i32 0
    219   %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
    220   %data.1 = extractelement <2 x float> %data, i32 0
    221   store float %data.1, float addrspace(1)* %gep.1
    222 
          ; Load of the value that later decides the branch.
    223   %idx.2 = extractelement <3 x i32> %idx, i32 1
    224   %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
    225   %z = load float, float addrspace(1)* %gep.2
    226 
          ; Second store: element 1 of %data to %ptr[%idx[2]].
    227   %idx.3 = extractelement <3 x i32> %idx, i32 2
    228   %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
    229   %data.3 = extractelement <2 x float> %data, i32 1
    230   store float %data.3, float addrspace(1)* %gep.3
    231 
    232   %cc = fcmp ogt float %z, 0.0
    233   br i1 %cc, label %IF, label %ELSE
    234 
    235 IF:
    236   %tex.IF = fmul float %tex.1, 3.0
    237   br label %END
    238 
    239 ELSE:
    240   %tex.ELSE = fmul float %tex.1, 4.0
    241   br label %END
    242 
    243 END:
    244   %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
    245   ret float %tex.END
    246 }
    247 
    248 ; Another test that failed at some point because of terminator handling.
    249 ;
    250 ;CHECK-LABEL: {{^}}test_control_flow_4:
    251 ;CHECK-NEXT: ; %main_body
    252 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
    253 ;CHECK-NEXT: s_wqm_b64 exec, exec
    254 ;CHECK: %IF
    255 ;CHECK: load
    256 ;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]],  [[ORIG]]
    257 ;CHECK: store
    258 ;CHECK: s_mov_b64 exec, [[SAVE]]
    259 ;CHECK: %END
    260 ;CHECK: image_sample
    261 define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) {
        ; If-without-else: when %y == 0 the IF block copies the float at %ptr to
        ; the next element (%ptr + 1); every path then reaches END, which does an
        ; image sample on %coord and returns it.
        ; The %z parameter is not referenced in the body.
    262 main_body:
    263   %cond = icmp eq i32 %y, 0
    264   br i1 %cond, label %IF, label %END
    265 
    266 IF:
          ; A load immediately followed by a store of the loaded value.
    267   %data = load float, float addrspace(1)* %ptr
    268   %gep = getelementptr float, float addrspace(1)* %ptr, i32 1
    269   store float %data, float addrspace(1)* %gep
    270   br label %END
    271 
    272 END:
    273   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
    274   ret <4 x float> %tex
    275 }
    276 
    277 ; Kill is performed in WQM so that uniform kill behaves correctly ...
    278 ;
    279 ;CHECK-LABEL: {{^}}test_kill_0:
    280 ;CHECK-NEXT: ; %main_body
    281 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
    282 ;CHECK-NEXT: s_wqm_b64 exec, exec
    283 ;CHECK: image_sample
    284 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
    285 ;SI: buffer_store_dword
    286 ;VI: flat_store_dword
    287 ;CHECK: s_wqm_b64 exec, exec
    288 ;CHECK: v_cmpx_
    289 ;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
    290 ;SI: buffer_store_dword
    291 ;VI: flat_store_dword
    292 ;CHECK: s_mov_b64 exec, [[SAVE]]
    293 ;CHECK: image_sample
    294 define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) {
        ; Sequence under test: sample, store, kill, store, sample. A second image
        ; sample follows the kill, and the function returns the element-wise sum
        ; of the two samples.
    295 main_body:
    296   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
    297 
          ; Store before the kill: element 0 of %data to %ptr[%idx[0]].
    298   %idx.0 = extractelement <2 x i32> %idx, i32 0
    299   %gep.0 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.0
    300   %data.0 = extractelement <2 x float> %data, i32 0
    301   store float %data.0, float addrspace(1)* %gep.0
    302 
          ; NOTE(review): presumably discards lanes based on the value of %z;
          ; confirm against the intrinsic's definition.
    303   call void @llvm.AMDGPU.kill(float %z)
    304 
          ; Store after the kill: element 1 of %data to %ptr[%idx[1]].
    305   %idx.1 = extractelement <2 x i32> %idx, i32 1
    306   %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
    307   %data.1 = extractelement <2 x float> %data, i32 1
    308   store float %data.1, float addrspace(1)* %gep.1
    309 
          ; Second sample, issued after the kill and both stores.
    310   %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
    311   %out = fadd <4 x float> %tex, %tex2
    312 
    313   ret <4 x float> %out
    314 }
    315 
    316 ; ... but only if WQM is necessary.
    317 ;
    318 ; CHECK-LABEL: {{^}}test_kill_1:
    319 ; CHECK-NEXT: ; %main_body
    320 ; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
    321 ; CHECK: s_wqm_b64 exec, exec
    322 ; CHECK: image_sample
    323 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
    324 ; SI: buffer_store_dword
    325 ; VI: flat_store_dword
    326 ; CHECK-NOT: wqm
    327 ; CHECK: v_cmpx_
    328 define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
        ; Sequence under test: sample, store, kill, return. Unlike test_kill_0,
        ; no image sample follows the kill. The %coord2 parameter is not
        ; referenced in the body.
    329 main_body:
    330   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
    331 
    332   %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx
    333   store float %data, float addrspace(1)* %gep
    334 
          ; Last operation before the return; nothing after it samples an image.
    335   call void @llvm.AMDGPU.kill(float %z)
    336 
    337   ret <4 x float> %tex
    338 }
    339 
    340 ; Check prolog shaders.
    341 ;
    342 ; CHECK-LABEL: {{^}}test_prolog_1:
    343 ; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
    344 ; CHECK: s_wqm_b64 exec, exec
    345 ; CHECK: v_add_f32_e32 v0,
    346 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
    347 define amdgpu_ps float @test_prolog_1(float %a, float %b) #4 {
        ; Minimal shader with no image or memory operations: it returns %a + %b.
        ; The function carries attribute group #4, which this file defines as
        ; { "amdgpu-ps-wqm-outputs" }; the FileCheck directives above expect a
        ; WQM enter before the add and a restore of the saved exec mask after it.
    348 main_body:
    349   %s = fadd float %a, %b
    350   ret float %s
    351 }
    352 
    353 declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
    354 
    355 declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
    356 
        ; Image sample intrinsics, in the forms taking an i32 or a <4 x i32>
        ; first operand.
    357 declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
    358 declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
    359 
    360 declare void @llvm.AMDGPU.kill(float)
        ; NOTE(review): llvm.SI.export is declared but not called by any function
        ; visible in this file.
    361 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
    362 
        ; #1 is used by the image store, #2 by the image load, #3 by the image
        ; samples, and #4 by test_prolog_1.
    363 attributes #1 = { nounwind }
    364 attributes #2 = { nounwind readonly }
    365 attributes #3 = { nounwind readnone }
    366 attributes #4 = { "amdgpu-ps-wqm-outputs" }
    367