; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=VI %s

; Check that WQM isn't triggered by image load/store intrinsics.
;
;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_wqm
define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
  ret <4 x float> %tex
}

; Check that WQM is triggered by code calculating inputs to image samples and is disabled again as soon as possible.
;
;CHECK-LABEL: {{^}}test2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: interp
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK-NOT: interp
;CHECK: image_sample
;CHECK-NOT: exec
;CHECK: .size test2
define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
main_body:
  %inst23 = extractelement <2 x float> %pos, i32 0
  %inst24 = extractelement <2 x float> %pos, i32 1
  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ret <4 x float> %tex
}

; ... but disabled for stores (and, in this simple case, not re-enabled) ...
;
;CHECK-LABEL: {{^}}test3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: store
;CHECK-NOT: exec
;CHECK: .size test3
define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %tex.2 = extractelement <4 x i32> %tex.1, i32 0

  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i1 0, i1 0)

  ret <4 x float> %tex
}

; ... and disabled for export.
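; (Exports use the current exec mask, so they must run in exact mode;
; otherwise helper lanes would contribute to the exported color.)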
;
;CHECK-LABEL: {{^}}test3x:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: exp
;CHECK-NOT: exec
;CHECK: .size test3x
define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
main_body:
  %inst23 = extractelement <2 x float> %pos, i32 0
  %inst24 = extractelement <2 x float> %pos, i32 1
  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex.0 = extractelement <4 x float> %tex, i32 0
  %tex.1 = extractelement <4 x float> %tex, i32 1
  %tex.2 = extractelement <4 x float> %tex, i32 2
  %tex.3 = extractelement <4 x float> %tex, i32 3
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true)
  ret void
}

; Check that WQM is re-enabled when required.
;
;CHECK-LABEL: {{^}}test4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: v_mul_lo_i32 [[MUL:v[0-9]+]], v0, v1
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
main_body:
  %c.1 = mul i32 %c, %d

  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0)
  %c.1.bc = bitcast i32 %c.1 to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ret <4 x float> %dtex
}

; Check that WQM is triggered by the wqm intrinsic.
;
;CHECK-LABEL: {{^}}test5:
;CHECK: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
  ret float %out.0
}

; Check that the wqm intrinsic works correctly for integers.
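; (llvm.amdgcn.wqm is an identity copy whose only effect is to mark the
; computation of its operand as needing WQM, regardless of the value's type.)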
;
;CHECK-LABEL: {{^}}test6:
;CHECK: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
  %out = fadd float %src0, %src1
  %out.0 = bitcast float %out to i32
  %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
  %out.2 = bitcast i32 %out.1 to float
  ret float %out.2
}

; Check that WWM is triggered by the wwm intrinsic.
;
;CHECK-LABEL: {{^}}test_wwm1:
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  ret float %out.0
}

; Same as above, but with an integer type.
;
;CHECK-LABEL: {{^}}test_wwm2:
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_{{[iu]}}32_e32
define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
  %src0.0 = bitcast float %src0 to i32
  %src1.0 = bitcast float %src1 to i32
  %out = add i32 %src0.0, %src1.0
  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
  %out.1 = bitcast i32 %out.0 to float
  ret float %out.1
}

; Check that we don't leave WWM on for computations that don't require WWM,
; since that will lead to clobbering things that aren't supposed to be
; clobbered in cases like this.
;
;CHECK-LABEL: {{^}}test_wwm3:
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK: v_add_f32_e32
define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  %out.1 = fadd float %src, %out.0
  br label %endif

endif:
  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
  ret float %out.2
}

; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
; write could clobber disabled channels in the non-WWM one.
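; (Hence the CHECK-NEXT below: the WWM result must be copied into a fresh
; register by a v_mov after exec is restored, not coalesced with the phi.)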
;
;CHECK-LABEL: {{^}}test_wwm4:
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK-NEXT: v_mov_b32_e32
define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  br label %endif

endif:
  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
  ret float %out.1
}

; Make sure the transition from Exact to WWM then WQM works properly.
;
;CHECK-LABEL: {{^}}test_wwm5:
;CHECK: buffer_load_dword
;CHECK: buffer_store_dword
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK: s_wqm_b64 exec, exec
define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
  call void @llvm.amdgcn.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
  %temp = fadd float %src1, %src1
  %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
  %out = fadd float %temp.0, %temp.0
  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
  ret float %out.0
}

; Check that WWM is turned on correctly across basic block boundaries.
;
;CHECK-LABEL: {{^}}test_wwm6:
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;SI: buffer_load_dword
;VI: flat_load_dword
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK: %if
;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
;SI: buffer_load_dword
;VI: flat_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG2]]
define amdgpu_ps float @test_wwm6() {
main_body:
  %src0 = load volatile float, float addrspace(1)* undef
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %src1 = load volatile float, float addrspace(1)* undef
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  br label %endif

endif:
  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
  ret float %out.1
}

; Check that @llvm.amdgcn.set.inactive disables WWM.
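; (set.inactive is lowered to a pair of exec inversions around a v_mov that
; writes the chosen value into the inactive lanes, so it must itself run
; outside the WWM region.)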
;
;CHECK-LABEL: {{^}}test_set_inactive1:
;CHECK: buffer_load_dword
;CHECK: s_not_b64 exec, exec
;CHECK: v_mov_b32_e32
;CHECK: s_not_b64 exec, exec
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
;CHECK: v_add_{{[iu]}}32_e32
define amdgpu_ps void @test_set_inactive1(i32 inreg %idx) {
main_body:
  %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
  %src.0 = bitcast float %src to i32
  %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
  %out = add i32 %src.1, %src.1
  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
  %out.1 = bitcast i32 %out.0 to float
  call void @llvm.amdgcn.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
  ret void
}

; Check that enabling WQM anywhere enables WQM for the set.inactive source.
;
;CHECK-LABEL: {{^}}test_set_inactive2:
;CHECK: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
  %src1.0 = bitcast float %src1 to i32
  %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef)
  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
  %src0.0 = bitcast float %src0 to i32
  %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0)
  %out = add i32 %src0.1, %src1.1
  %out.0 = bitcast i32 %out to float
  call void @llvm.amdgcn.buffer.store.f32(float %out.0, <4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
  ret void
}

; Check a case of one branch of an if-else requiring WQM, the other requiring
; exact.
;
; Note: In this particular case, the save-and-restore could be avoided if the
; analysis understood that the two branches of the if-else are mutually
; exclusive.
;
;CHECK-LABEL: {{^}}test_control_flow_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %ELSE
;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVED]]
;CHECK: %IF
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ELSE

IF:
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %data.if = extractelement <4 x float> %dtex, i32 0
  br label %END

ELSE:
  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
  br label %END

END:
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Reverse branch order compared to the previous test.
;
;CHECK-LABEL: {{^}}test_control_flow_1:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: image_sample
;CHECK: image_sample
;CHECK: %Flow
;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]
;CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %ELSE
;CHECK: store_dword
;CHECK: [[END_BB]]: ; %END
;CHECK: s_or_b64 exec, exec,
;CHECK: v_mov_b32_e32 v0
;CHECK: ; return
define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %ELSE, label %IF

IF:
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %data.if = extractelement <4 x float> %dtex, i32 0
  br label %END

ELSE:
  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
  br label %END

END:
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Check that branch conditions are properly marked as needing WQM...
;
;CHECK-LABEL: {{^}}test_control_flow_2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: load
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmp
define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
main_body:
  %idx.1 = extractelement <3 x i32> %idx, i32 0
  %data.1 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)

  ; The load that determines the branch (and should therefore be WQM) is
  ; surrounded by stores that require disabled WQM.
  %idx.2 = extractelement <3 x i32> %idx, i32 1
  %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0)

  %idx.3 = extractelement <3 x i32> %idx, i32 2
  %data.3 = extractelement <2 x float> %data, i32 1
  call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0)

  %cc = fcmp ogt float %z, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %coord.IF = mul i32 %coord, 3
  br label %END

ELSE:
  %coord.ELSE = mul i32 %coord, 4
  br label %END

END:
  %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
  %coord.END.bc = bitcast i32 %coord.END to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ret <4 x float> %tex
}

; ... but only if they really do need it.
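; (Here the condition is computed from a sampled value that is needed in
; exact mode anyway, and nothing after the branch requires WQM, so the
; compare can stay in exact mode.)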
;
;CHECK-LABEL: {{^}}test_control_flow_3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK-DAG: v_cmp
;CHECK-DAG: store
define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %dtex.1 = extractelement <4 x float> %dtex, i32 0
  call void @llvm.amdgcn.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)

  %cc = fcmp ogt float %dtex.1, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %tex.IF = fmul float %dtex.1, 3.0
  br label %END

ELSE:
  %tex.ELSE = fmul float %dtex.1, 4.0
  br label %END

END:
  %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
  ret float %tex.END
}

; Another test that failed at some point because of terminator handling.
;
;CHECK-LABEL: {{^}}test_control_flow_4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: load
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: %END
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) {
main_body:
  %cond = icmp eq i32 %y, 0
  br i1 %cond, label %IF, label %END

IF:
  %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0)
  br label %END

END:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ret <4 x float> %dtex
}

; Kill is performed in WQM mode so that uniform kill behaves correctly ...
;
;CHECK-LABEL: {{^}}test_kill_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: buffer_store_dword
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmpx_
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: buffer_store_dword
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: image_sample
define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %idx.0 = extractelement <2 x i32> %idx, i32 0
  %data.0 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0)

  call void @llvm.AMDGPU.kill(float %z)

  %idx.1 = extractelement <2 x i32> %idx, i32 1
  %data.1 = extractelement <2 x float> %data, i32 1
  call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
  %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex2.0 = extractelement <4 x float> %tex2, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %out = fadd <4 x float> %tex, %dtex

  ret <4 x float> %out
}

; ... but only if WQM is necessary.
;
; CHECK-LABEL: {{^}}test_kill_1:
; CHECK-NEXT: ; %main_body
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: image_sample
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; CHECK: image_sample
; CHECK: buffer_store_dword
; CHECK-NOT: wqm
; CHECK: v_cmpx_
define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0

  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)

  call void @llvm.AMDGPU.kill(float %z)

  ret <4 x float> %dtex
}

; Check prolog shaders.
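; (The "amdgpu-ps-wqm-outputs" attribute (#5) requests that values returned
; by this shader part be computed in WQM, since a later merged part may
; sample with them.)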
;
; CHECK-LABEL: {{^}}test_prolog_1:
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: v_add_f32_e32 v0,
; CHECK: s_and_b64 exec, exec, [[ORIG]]
define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
main_body:
  %s = fadd float %a, %b
  ret float %s
}

; CHECK-LABEL: {{^}}test_loop_vcc:
; CHECK-NEXT: ; %entry
; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_store
; CHECK: s_wqm_b64 exec, exec
; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
; CHECK-DAG: v_mov_b32_e32 [[SEVEN:v[0-9]+]], 0x40e00000

; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
; CHECK: v_cmp_gt_f32_e32 vcc, [[CTR]], [[SEVEN]]
; CHECK: s_cbranch_vccz [[LOOPHDR]]
; CHECK: ; %break

; CHECK: ; return
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
entry:
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0)
  br label %loop

loop:
  %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
  %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
  %cc = fcmp ogt float %ctr.iv, 7.0
  br i1 %cc, label %break, label %body

body:
  %c.iv0 = extractelement <4 x float> %c.iv, i32 0
  %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0
  %ctr.next = fadd float %ctr.iv, 2.0
  br label %loop

break:
  ret <4 x float> %c.iv
}

; Only intrinsic stores need exact execution -- other stores do not have
; externally visible effects and may require WQM for correctness.
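; (In the checks below, only the buffer intrinsics force a switch back to
; exact mode; the stores to the scratch alloca execute in WQM.)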
;
; CHECK-LABEL: {{^}}test_alloca:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec

; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
; CHECK: s_wqm_b64 exec, exec
; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4{{$}}
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
; CHECK: s_wqm_b64 exec, exec
; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen

; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_sample
; CHECK: buffer_store_dwordx4
define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
entry:
  %array = alloca [32 x i32], align 4, addrspace(5)

  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)

  %s.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 0
  store volatile i32 %a, i32 addrspace(5)* %s.gep, align 4

  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0)

  %c.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 %idx
  %c = load i32, i32 addrspace(5)* %c.gep, align 4
  %c.bc = bitcast i32 %c to float
  %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)

  ret void
}

; Must return to exact at the end of a non-void returning shader, otherwise
; the EXEC mask exported by the epilog will be wrong. This is true even if
; the shader has no kills, because a kill could have happened in a previous
; shader fragment.
;
; CHECK-LABEL: {{^}}test_nonvoid_return:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
;
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0
  ret <4 x float> %dtex
}

; CHECK-LABEL: {{^}}test_nonvoid_return_unreachable:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
;
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
entry:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0
  %cc = icmp sgt i32 %c, 0
  br i1 %cc, label %if, label %else

if:
  store volatile <4 x float> %dtex, <4 x float> addrspace(1)* undef
  unreachable

else:
  ret <4 x float> %dtex
}

; Test awareness that s_wqm_b64 clobbers SCC.
;
; CHECK-LABEL: {{^}}test_scc:
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: s_cmp_
; CHECK-NEXT: s_cbranch_scc
; CHECK: ; %if
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; CHECK: image_sample
; CHECK: ; %else
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; CHECK: image_sample
; CHECK: ; %end
define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
main_body:
  %cc = icmp sgt i32 %sel, 0
  br i1 %cc, label %if, label %else

if:
  %r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0
  br label %end

else:
  %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0
  br label %end

end:
  %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
  call void @llvm.amdgcn.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
  ret <4 x float> %r
}

; Check a case of a block being entirely WQM except for a bit of WWM.
; There was a bug where the pass forgot to enter and leave WWM.
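; (set.inactive + ds_swizzle + wwm is a typical cross-lane access pattern;
; what matters here is only that the WWM region inside the WQM block still
; gets its enter/leave exec updates.)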
;
;CHECK-LABEL: {{^}}test_wwm_within_wqm:
;CHECK: %IF
;CHECK: s_or_saveexec_b64 {{.*}}, -1
;CHECK: ds_swizzle
;
define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ENDIF

IF:
  %dataf = extractelement <4 x float> %dtex, i32 0
  %data1 = fptosi float %dataf to i32
  %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
  %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
  %data4 = call i32 @llvm.amdgcn.wwm.i32(i32 %data3)
  %data4f = sitofp i32 %data4 to float
  br label %ENDIF

ENDIF:
  %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
  ret float %r
}

declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #2
declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2
declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare void @llvm.AMDGPU.kill(float) #1
declare float @llvm.amdgcn.wqm.f32(float) #3
declare i32 @llvm.amdgcn.wqm.i32(i32) #3
declare float @llvm.amdgcn.wwm.f32(float) #3
declare i32 @llvm.amdgcn.wwm.i32(i32) #3
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)

; Attribute group #0 is referenced at the sample call sites above; it is
; assumed here to match the readnone intrinsic declarations.
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }
attributes #4 = { nounwind readnone convergent }
attributes #5 = { "amdgpu-ps-wqm-outputs" }
attributes #6 = { nounwind "InitialPSInputAddr"="2" }