; Codegen test: the functions in this file are compiled by llc for
; -march=amdgcn twice (once with -mcpu=verde, once with -mcpu=tonga) and the
; output is matched by FileCheck. Both invocations share one common check
; prefix; each invocation additionally enables one target-specific prefix.
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SI
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI

; Check that WQM isn't triggered by image load/store intrinsics.
;
;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_wqm
define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) {
main_body:
  ; Load through the image load intrinsic and write the same value back through
  ; the image store intrinsic; the checks above require that no "s_wqm" text
  ; follows the test1 label.
  %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
  call void @llvm.amdgcn.image.store.v4i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
  ret <4 x float> %tex
}

; Check that WQM is triggered by image samples and left untouched for loads...
;
;CHECK-LABEL: {{^}}test2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK-NOT: exec
;CHECK: _load_dword v0,
define amdgpu_ps float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
main_body:
  ; Element 0 of the sample result (reinterpreted as i32) is used as the index
  ; of a plain load from %ptr. The checks expect s_wqm_b64 directly after the
  ; block label and no mention of exec between image_sample and the load.
  %c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %c.2 = bitcast <4 x float> %c.1 to <4 x i32>
  %c.3 = extractelement <4 x i32> %c.2, i32 0
  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
  %data = load float, float addrspace(1)* %gep
  ret float %data
}

; ... but disabled for stores (and, in this simple case, not re-enabled).
;
;CHECK-LABEL: {{^}}test3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK-NOT: exec
;CHECK: .size test3
; Expected sequence for test3: copy exec into ORIG, s_wqm_b64, sample, AND exec
; with ORIG ahead of the store, then no further mention of exec up to
; ".size test3".
define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
main_body:
  %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ; Element 0 of the sample (as i32) is the store index; element 1 (as float)
  ; is the value stored.
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %tex.2 = extractelement <4 x i32> %tex.1, i32 0
  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %tex.2
  %wr = extractelement <4 x float> %tex, i32 1
  store float %wr, float addrspace(1)* %gep
  ret <4 x float> %tex
}

; Check that WQM is re-enabled when required.
;
;CHECK-LABEL: {{^}}test4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: v_mul_lo_i32 [[MUL:v[0-9]+]], v0, v1
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: image_sample v[0:3], [[MUL]], s[0:7], s[8:11] dmask:0xf
; In test4 the store precedes the sample. The checks expect the multiply before
; exec is ANDed with ORIG, then the store, then a second s_wqm_b64 ahead of the
; image_sample that consumes the multiply result (MUL).
define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
main_body:
  ; %c.1 is used both as the store index and as the sample coordinate.
  %c.1 = mul i32 %c, %d
  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1
  store float %data, float addrspace(1)* %gep
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ret <4 x float> %tex
}

; Check a case of one branch of an if-else requiring WQM, the other requiring
; exact.
;
; Note: In this particular case, the save-and-restore could be avoided if the
; analysis understood that the two branches of the if-else are mutually
; exclusive.
;
;CHECK-LABEL: {{^}}test_control_flow_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %ELSE
;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVED]]
;CHECK: %IF
;CHECK: image_sample
; The checks expect the %ELSE block to be emitted before %IF, with its store
; bracketed by s_and_saveexec_b64 (against ORIG) and an s_mov_b64 that restores
; exec from SAVED.
define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
main_body:
  ; %z == 0 selects the sampling branch (IF); otherwise the storing branch
  ; (ELSE) is taken.
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ELSE

IF:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %data.if = extractelement <4 x float> %tex, i32 0
  br label %END

ELSE:
  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
  store float %data, float addrspace(1)* %gep
  br label %END

END:
  ; Result is the sampled component on the IF path and the unmodified %data
  ; argument on the ELSE path.
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Reverse branch order compared to the previous test.
;
;CHECK-LABEL: {{^}}test_control_flow_1:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: image_sample
;CHECK: %Flow
;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]
;CHECK-NEXT: ; BB#3: ; %ELSE
;CHECK: store_dword
;CHECK: [[END_BB]]: ; %END
;CHECK: s_or_b64 exec, exec,
;CHECK: v_mov_b32_e32 v0
;CHECK: ; return
; The checks expect %IF (with the sample) first, then a %Flow block in which
; exec is ANDed with ORIG directly after s_or_saveexec_b64, before the %ELSE
; store. Note that the expected output pins the literal block number "BB#3".
define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
main_body:
  ; Branch targets are swapped here: %z == 0 goes to the storing branch (ELSE),
  ; any other value goes to the sampling branch (IF).
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %ELSE, label %IF

IF:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %data.if = extractelement <4 x float> %tex, i32 0
  br label %END

ELSE:
  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
  store float %data, float addrspace(1)* %gep
  br label %END

END:
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Check that branch conditions are properly marked as needing WQM...
;
;CHECK-LABEL: {{^}}test_control_flow_2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: load
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmp
; Expected exec toggling: AND with ORIG before the first store, s_wqm_b64 before
; the load, AND with ORIG before the second store, and s_wqm_b64 again before
; the compare.
define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
main_body:
  ; First store: index from %idx element 0, value from %data element 0.
  %idx.1 = extractelement <3 x i32> %idx, i32 0
  %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
  %data.1 = extractelement <2 x float> %data, i32 0
  store float %data.1, float addrspace(1)* %gep.1

  ; The load that determines the branch (and should therefore be WQM) is
  ; surrounded by stores that require disabled WQM.
  %idx.2 = extractelement <3 x i32> %idx, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
  %z = load float, float addrspace(1)* %gep.2

  ; Second store: index from %idx element 2, value from %data element 1.
  %idx.3 = extractelement <3 x i32> %idx, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
  %data.3 = extractelement <2 x float> %data, i32 1
  store float %data.3, float addrspace(1)* %gep.3

  %cc = fcmp ogt float %z, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %coord.IF = mul i32 %coord, 3
  br label %END

ELSE:
  %coord.ELSE = mul i32 %coord, 4
  br label %END

END:
  ; The branch picks the coordinate that is passed to the sample intrinsic.
  %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord.END, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ret <4 x float> %tex
}

; ... but only if they really do need it.
;
;CHECK-LABEL: {{^}}test_control_flow_3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: load
;CHECK: store
;CHECK: v_cmp
; The checks look for one AND of exec with ORIG after the sample, followed by
; store, load, store and the compare. They do not explicitly forbid a later
; s_wqm_b64; they only omit it from the expected sequence.
define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
main_body:
  ; The sample happens up front; the branch below only selects which constant
  ; multiplies its first component.
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.1 = extractelement <4 x float> %tex, i32 0

  %idx.1 = extractelement <3 x i32> %idx, i32 0
  %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
  %data.1 = extractelement <2 x float> %data, i32 0
  store float %data.1, float addrspace(1)* %gep.1

  ; Loaded value that feeds the branch condition.
  %idx.2 = extractelement <3 x i32> %idx, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
  %z = load float, float addrspace(1)* %gep.2

  %idx.3 = extractelement <3 x i32> %idx, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
  %data.3 = extractelement <2 x float> %data, i32 1
  store float %data.3, float addrspace(1)* %gep.3

  %cc = fcmp ogt float %z, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %tex.IF = fmul float %tex.1, 3.0
  br label %END

ELSE:
  %tex.ELSE = fmul float %tex.1, 4.0
  br label %END

END:
  %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
  ret float %tex.END
}

; Another test that failed at some point because of terminator handling.
;
;CHECK-LABEL: {{^}}test_control_flow_4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: load
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: %END
;CHECK: image_sample
; Inside %IF the checks expect the load first, then the store bracketed by
; s_and_saveexec_b64 (against ORIG) and an s_mov_b64 restoring exec from SAVE,
; all before the %END block containing the sample.
define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) {
main_body:
  ; %y == 0 runs the load/store block; otherwise control goes straight to END.
  %cond = icmp eq i32 %y, 0
  br i1 %cond, label %IF, label %END

IF:
  ; Copy the float at %ptr to the element that follows it.
  %data = load float, float addrspace(1)* %ptr
  %gep = getelementptr float, float addrspace(1)* %ptr, i32 1
  store float %data, float addrspace(1)* %gep
  br label %END

END:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ret <4 x float> %tex
}

; Kill is performed in WQM mode so that uniform kill behaves correctly ...
;
;CHECK-LABEL: {{^}}test_kill_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;SI: buffer_store_dword
;VI: flat_store_dword
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmpx_
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;SI: buffer_store_dword
;VI: flat_store_dword
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: image_sample
; Expected order: sample, exec ANDed with ORIG, first store, s_wqm_b64, the
; v_cmpx_ compare, s_and_saveexec_b64 against ORIG, second store, exec restored
; from SAVE, second sample. The store mnemonic is matched per target:
; buffer_store_dword for the verde invocation, flat_store_dword for tonga.
define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) {
main_body:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)

  ; Store before the kill: index and value come from element 0 of %idx/%data.
  %idx.0 = extractelement <2 x i32> %idx, i32 0
  %gep.0 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.0
  %data.0 = extractelement <2 x float> %data, i32 0
  store float %data.0, float addrspace(1)* %gep.0

  call void @llvm.AMDGPU.kill(float %z)

  ; Store after the kill: index and value come from element 1 of %idx/%data.
  %idx.1 = extractelement <2 x i32> %idx, i32 1
  %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
  %data.1 = extractelement <2 x float> %data, i32 1
  store float %data.1, float addrspace(1)* %gep.1

  ; A second sample follows the kill; both sample results are summed for the
  ; return value.
  %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %out = fadd <4 x float> %tex, %tex2

  ret <4 x float> %out
}

; ... but only if WQM is necessary.
;
; CHECK-LABEL: {{^}}test_kill_1:
; CHECK-NEXT: ; %main_body
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: image_sample
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; SI: buffer_store_dword
; VI: flat_store_dword
; CHECK-NOT: wqm
; CHECK: v_cmpx_
; In test_kill_1 nothing is sampled after the kill. The checks require that no
; "wqm" text appears between the store and the v_cmpx_ compare.
define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
main_body:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)

  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx
  store float %data, float addrspace(1)* %gep

  call void @llvm.AMDGPU.kill(float %z)

  ret <4 x float> %tex
}

; Check prolog shaders.
;
; CHECK-LABEL: {{^}}test_prolog_1:
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: v_add_f32_e32 v0,
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; test_prolog_1 contains no sample, store or kill; it only carries attribute
; group #4 ("amdgpu-ps-wqm-outputs"). The checks expect the add to sit between
; s_wqm_b64 and the AND of exec with ORIG.
define amdgpu_ps float @test_prolog_1(float %a, float %b) #4 {
main_body:
  %s = fadd float %a, %b
  ret float %s
}

; Intrinsic declarations referenced by the test functions.
declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1

declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2

declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3

declare void @llvm.AMDGPU.kill(float)
; NOTE(review): llvm.SI.export is declared but not called anywhere in the
; visible part of this file.
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)

; Attribute groups: #1 image store, #2 image load, #3 image sample
; declarations, #4 the test_prolog_1 definition.
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }
attributes #4 = { "amdgpu-ps-wqm-outputs" }