; AMDGPU whole-quad-mode (WQM/WWM) codegen tests (test/CodeGen/AMDGPU).
      1 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=SI %s
      2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=VI %s
      3 
; Check that WQM isn't triggered by image load/store intrinsics.
;
;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_wqm
define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
main_body:
  ; A plain image load feeding an image store needs no helper lanes, so the
  ; WQM pass must not emit any s_wqm instruction for this shader.
  %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
  ret <4 x float> %tex
}
     14 
; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible
;
;CHECK-LABEL: {{^}}test2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: interp
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK-NOT: interp
;CHECK: image_sample
;CHECK-NOT: exec
;CHECK: .size test2
define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
main_body:
  ; The interpolation results become the sample coordinates, so the interp
  ; calls run under WQM; exec is re-ANDed with the saved mask right after the
  ; last interp and not touched again (see the CHECK lines above).
  %inst23 = extractelement <2 x float> %pos, i32 0
  %inst24 = extractelement <2 x float> %pos, i32 1
  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ret <4 x float> %tex
}
     38 
; ... but disabled for stores (and, in this simple case, not re-enabled) ...
;
;CHECK-LABEL: {{^}}test3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: store
;CHECK-NOT: exec
;CHECK: .size test3
define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
main_body:
  ; The buffer store has externally visible effects, so exec must be back to
  ; the exact (original) mask before it executes; nothing after it needs WQM.
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %tex.2 = extractelement <4 x i32> %tex.1, i32 0

  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i1 0, i1 0)

  ret <4 x float> %tex
}
     60 
; ... and disabled for export.
;
;CHECK-LABEL: {{^}}test3x:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: exp
;CHECK-NOT: exec
;CHECK: .size test3x
define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
main_body:
  ; Same shape as test2, but the sample result is exported; the exp intrinsic
  ; requires exact mode, so exec is restored before it and never re-widened.
  %inst23 = extractelement <2 x float> %pos, i32 0
  %inst24 = extractelement <2 x float> %pos, i32 1
  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex.0 = extractelement <4 x float> %tex, i32 0
  %tex.1 = extractelement <4 x float> %tex, i32 1
  %tex.2 = extractelement <4 x float> %tex, i32 2
  %tex.3 = extractelement <4 x float> %tex, i32 3
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true)
  ret void
}
     88 
; Check that WQM is re-enabled when required.
;
;CHECK-LABEL: {{^}}test4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: v_mul_lo_i32 [[MUL:v[0-9]+]], v0, v1
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
main_body:
  ; %c.1 feeds both the exact store and (as a coordinate) the WQM samples, so
  ; the shader must transition WQM -> exact -> WQM around the store.
  %c.1 = mul i32 %c, %d

  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0)
  %c.1.bc = bitcast i32 %c.1 to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ret <4 x float> %dtex
}
    112 
; Check that WQM is triggered by the wqm intrinsic.
;
;CHECK-LABEL: {{^}}test5:
;CHECK: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  ; llvm.amdgcn.wqm on the sum forces both loads and the add into WQM.
  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
  ret float %out.0
}
    128 
; Check that the wqm intrinsic works correctly for integers.
;
;CHECK-LABEL: {{^}}test6:
;CHECK: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  ; Same as test5, but the WQM-marked value is routed through i32 bitcasts to
  ; exercise the integer flavor of the intrinsic.
  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
  %out = fadd float %src0, %src1
  %out.0 = bitcast float %out to i32
  %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
  %out.2 = bitcast i32 %out.1 to float
  ret float %out.2
}
    146 
; Check that WWM is triggered by the wwm intrinsic.
;
;CHECK-LABEL: {{^}}test_wwm1:
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  ; WWM is entered via s_or_saveexec with -1 (all lanes on), covering the
  ; loads and the add that produce the wwm-marked value.
  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  ret float %out.0
}
    162 
; Same as above, but with an integer type.
;
;CHECK-LABEL: {{^}}test_wwm2:
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_{{[iu]}}32_e32
define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  ; Integer flavor of test_wwm1: loads are bitcast to i32, added, and the sum
  ; is marked with llvm.amdgcn.wwm.i32.
  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
  %src0.0 = bitcast float %src0 to i32
  %src1.0 = bitcast float %src1 to i32
  %out = add i32 %src0.0, %src1.0
  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
  %out.1 = bitcast i32 %out.0 to float
  ret float %out.1
}
    181 
; Check that we don't leave WWM on for computations that don't require WWM,
; since that will lead clobbering things that aren't supposed to be clobbered
; in cases like this.
;
;CHECK-LABEL: {{^}}test_wwm3:
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK: v_add_f32_e32
define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  ; Only the computation of %out (consumed by the wwm intrinsic) may run in
  ; WWM; the follow-up fadd of %out.1 must execute after exec is restored.
  %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  %out.1 = fadd float %src, %out.0
  br label %endif

endif:
  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
  ret float %out.2
}
    211 
; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
; write could clobber disabled channels in the non-WWM one.
;
;CHECK-LABEL: {{^}}test_wwm4:
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK-NEXT: v_mov_b32_e32
define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  ; The copy out of the WWM result (v_mov right after exec restore, per the
  ; CHECK-NEXT above) must be a separate instruction, not folded into WWM.
  %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  br label %endif

endif:
  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
  ret float %out.1
}
    239 
; Make sure the transition from Exact to WWM then WQM works properly.
;
;CHECK-LABEL: {{^}}test_wwm5:
;CHECK: buffer_load_dword
;CHECK: buffer_store_dword
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK: s_wqm_b64 exec, exec
define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  ; Order of requirements: the store needs Exact, %temp.0 needs WWM, and the
  ; final wqm-marked add needs WQM -- exercising all three transitions.
  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
  call void @llvm.amdgcn.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
  %temp = fadd float %src1, %src1
  %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
  %out = fadd float %temp.0, %temp.0
  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
  ret float %out.0
}
    261 
    262 ; Check that WWM is turned on correctly across basic block boundaries.
    263 ;
    264 ;CHECK-LABEL: {{^}}test_wwm6:
    265 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
    266 ;SI-CHECK: buffer_load_dword
    267 ;VI-CHECK: flat_load_dword
    268 ;CHECK: s_mov_b64 exec, [[ORIG]]
    269 ;CHECK: %if
    270 ;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
    271 ;SI-CHECK: buffer_load_dword
    272 ;VI-CHECK: flat_load_dword
    273 ;CHECK: v_add_f32_e32
    274 ;CHECK: s_mov_b64 exec, [[ORIG2]]
    275 define amdgpu_ps float @test_wwm6() {
    276 main_body:
    277   %src0 = load volatile float, float addrspace(1)* undef
    278   ; use mbcnt to make sure the branch is divergent
    279   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
    280   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
    281   %cc = icmp uge i32 %hi, 32
    282   br i1 %cc, label %endif, label %if
    283 
    284 if:
    285   %src1 = load volatile float, float addrspace(1)* undef
    286   %out = fadd float %src0, %src1
    287   %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
    288   br label %endif
    289 
    290 endif:
    291   %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
    292   ret float %out.1
    293 }
    294 
; Check that @llvm.amdgcn.set.inactive disables WWM.
;
;CHECK-LABEL: {{^}}test_set_inactive1:
;CHECK: buffer_load_dword
;CHECK: s_not_b64 exec, exec
;CHECK: v_mov_b32_e32
;CHECK: s_not_b64 exec, exec
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
;CHECK: v_add_{{[iu]}}32_e32
define amdgpu_ps void @test_set_inactive1(i32 inreg %idx) {
main_body:
  ; set.inactive writes 0 into the inactive lanes (the s_not/v_mov/s_not
  ; sequence above), and must itself execute outside WWM; only the add that
  ; consumes it runs under the saveexec -1 mask.
  %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
  %src.0 = bitcast float %src to i32
  %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
  %out = add i32 %src.1, %src.1
  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
  %out.1 = bitcast i32 %out.0 to float
  call void @llvm.amdgcn.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
  ret void
}
    315 
; Check that enabling WQM anywhere enables WQM for the set.inactive source.
;
;CHECK-LABEL: {{^}}test_set_inactive2:
;CHECK: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1)
 {
main_body:
  ; %src1 (the set.inactive source) must be loaded under WQM because the wqm
  ; intrinsic on %src0.1 puts the whole computation chain into WQM.
  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
  %src1.0 = bitcast float %src1 to i32
  %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef)
  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
  %src0.0 = bitcast float %src0 to i32
  %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0)
  %out = add i32 %src0.1, %src1.1
  %out.0 = bitcast i32 %out to float
  call void @llvm.amdgcn.buffer.store.f32(float %out.0, <4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
  ret void
}
    335 
; Check a case of one branch of an if-else requiring WQM, the other requiring
; exact.
;
; Note: In this particular case, the save-and-restore could be avoided if the
; analysis understood that the two branches of the if-else are mutually
; exclusive.
;
;CHECK-LABEL: {{^}}test_control_flow_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %ELSE
;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVED]]
;CHECK: %IF
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ELSE

IF:
  ; WQM branch: dependent image samples need helper lanes.
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %data.if = extractelement <4 x float> %dtex, i32 0
  br label %END

ELSE:
  ; Exact branch: the store is wrapped in save/restore of exec.
  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
  br label %END

END:
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}
    375 
; Reverse branch order compared to the previous test.
;
;CHECK-LABEL: {{^}}test_control_flow_1:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: image_sample
;CHECK: image_sample
;CHECK: %Flow
;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]
;CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %ELSE
;CHECK: store_dword
;CHECK: [[END_BB]]: ; %END
;CHECK: s_or_b64 exec, exec,
;CHECK: v_mov_b32_e32 v0
;CHECK: ; return
define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  ; Same IR as test_control_flow_0 except the branch condition sends %z == 0
  ; to ELSE first, so the WQM->exact transition lands in the %Flow block.
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %ELSE, label %IF

IF:
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %data.if = extractelement <4 x float> %dtex, i32 0
  br label %END

ELSE:
  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
  br label %END

END:
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}
    418 
; Check that branch conditions are properly marked as needing WQM...
;
;CHECK-LABEL: {{^}}test_control_flow_2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: load
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmp
define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
main_body:
  %idx.1 = extractelement <3 x i32> %idx, i32 0
  %data.1 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)

  ; The load that determines the branch (and should therefore be WQM) is
  ; surrounded by stores that require disabled WQM.
  %idx.2 = extractelement <3 x i32> %idx, i32 1
  %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0)

  %idx.3 = extractelement <3 x i32> %idx, i32 2
  %data.3 = extractelement <2 x float> %data, i32 1
  call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0)

  ; The branch selects the coordinate of the image_sample in %END, so the
  ; v_cmp itself must execute in WQM (last CHECK above).
  %cc = fcmp ogt float %z, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %coord.IF = mul i32 %coord, 3
  br label %END

ELSE:
  %coord.ELSE = mul i32 %coord, 4
  br label %END

END:
  %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
  %coord.END.bc = bitcast i32 %coord.END to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ret <4 x float> %tex
}
    465 
; ... but only if they really do need it.
;
;CHECK-LABEL: {{^}}test_control_flow_3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK-DAG: v_cmp
;CHECK-DAG: store
define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
main_body:
  ; Here the branch condition only feeds scalar fmuls (no image ops), so the
  ; compare may stay in exact mode after the store.
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %dtex.1 = extractelement <4 x float> %dtex, i32 0
  call void @llvm.amdgcn.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)

  %cc = fcmp ogt float %dtex.1, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %tex.IF = fmul float %dtex.1, 3.0
  br label %END

ELSE:
  %tex.ELSE = fmul float %dtex.1, 4.0
  br label %END

END:
  %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
  ret float %tex.END
}
    500 
; Another test that failed at some point because of terminator handling.
;
;CHECK-LABEL: {{^}}test_control_flow_4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]],  [[ORIG]]
;CHECK: load
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: %END
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) {
main_body:
  %cond = icmp eq i32 %y, 0
  br i1 %cond, label %IF, label %END

IF:
  ; Exact-mode copy inside divergent control flow; exec is saved/restored
  ; around it while the surrounding shader stays in WQM for the samples.
  %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0)
  br label %END

END:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ret <4 x float> %dtex
}
    531 
; Kill is performed in WQM mode so that uniform kill behaves correctly ...
;
;CHECK-LABEL: {{^}}test_kill_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: buffer_store_dword
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmpx_
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: buffer_store_dword
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: image_sample
define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %idx.0 = extractelement <2 x i32> %idx, i32 0
  %data.0 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0)

  ; The kill (lowered to v_cmpx_) must execute after re-entering WQM because
  ; more image samples follow it.
  call void @llvm.AMDGPU.kill(float %z)

  %idx.1 = extractelement <2 x i32> %idx, i32 1
  %data.1 = extractelement <2 x float> %data, i32 1
  call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
  %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex2.0 = extractelement <4 x float> %tex2, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %out = fadd <4 x float> %tex, %dtex

  ret <4 x float> %out
}
    566 
; ... but only if WQM is necessary.
;
; CHECK-LABEL: {{^}}test_kill_1:
; CHECK-NEXT: ; %main_body
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: image_sample
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; CHECK: image_sample
; CHECK: buffer_store_dword
; CHECK-NOT: wqm
; CHECK: v_cmpx_
define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
main_body:
  ; No image op follows the kill, so WQM must not be re-enabled for it
  ; (CHECK-NOT: wqm between the store and v_cmpx_).
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0

  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)

  call void @llvm.AMDGPU.kill(float %z)

  ret <4 x float> %dtex
}
    591 
; Check prolog shaders.
;
; CHECK-LABEL: {{^}}test_prolog_1:
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: v_add_f32_e32 v0,
; CHECK: s_and_b64 exec, exec, [[ORIG]]
define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
main_body:
  ; Attribute #5 presumably marks this as a PS prolog -- TODO confirm against
  ; the attribute list at the end of the file (outside this chunk).
  %s = fadd float %a, %b
  ret float %s
}
    604 
; Check WQM handling around a loop whose backedge condition lives in vcc.
;
; CHECK-LABEL: {{^}}test_loop_vcc:
; CHECK-NEXT: ; %entry
; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_store
; CHECK: s_wqm_b64 exec, exec
; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
; CHECK-DAG: v_mov_b32_e32 [[SEVEN:v[0-9]+]], 0x40e00000

; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
; CHECK: v_cmp_gt_f32_e32 vcc, [[CTR]], [[SEVEN]]
; CHECK: s_cbranch_vccz [[LOOPHDR]]
; CHECK: ; %break

; CHECK: ; return
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
entry:
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0)
  br label %loop

loop:
  ; Float induction variable counts 0, 2, 4, ... and exits once it exceeds
  ; 7.0 (0x40e00000 in the CHECK above); the sample in %body needs WQM.
  %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
  %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
  %cc = fcmp ogt float %ctr.iv, 7.0
  br i1 %cc, label %break, label %body

body:
  %c.iv0 = extractelement <4 x float> %c.iv, i32 0
  %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0
  %ctr.next = fadd float %ctr.iv, 2.0
  br label %loop

break:
  ret <4 x float> %c.iv
}
    642 
; Only intrinsic stores need exact execution -- other stores do not have
; externally visible effects and may require WQM for correctness.
;
; CHECK-LABEL: {{^}}test_alloca:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec

; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
; CHECK: s_wqm_b64 exec, exec
; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4{{$}}
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
; CHECK: s_wqm_b64 exec, exec
; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen

; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_sample
; CHECK: buffer_store_dwordx4
define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
entry:
  ; Private (addrspace(5)) stack traffic from the alloca runs in WQM; only
  ; the buffer.store intrinsics drop back to the exact mask.
  %array = alloca [32 x i32], align 4, addrspace(5)

  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)

  %s.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 0
  store volatile i32 %a, i32 addrspace(5)* %s.gep, align 4

  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0)

  %c.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 %idx
  %c = load i32, i32 addrspace(5)* %c.gep, align 4
  %c.bc = bitcast i32 %c to float
  %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)

  ret void
}
    681 
    682 ; Must return to exact at the end of a non-void returning shader,
    683 ; otherwise the EXEC mask exported by the epilog will be wrong. This is true
    684 ; even if the shader has no kills, because a kill could have happened in a
    685 ; previous shader fragment.
    686 ;
    687 ; CHECK-LABEL: {{^}}test_nonvoid_return:
    688 ; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
    689 ; CHECK: s_wqm_b64 exec, exec
    690 ;
    691 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
    692 ; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
  ; Two dependent samples keep the body in WQM; because the shader returns a
  ; value, the pass must restore the exact exec mask before the (implicit)
  ; return so the epilog exports the correct live mask.
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  ; Second sample consumes the first result, creating the WQM dependency.
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0
  ret <4 x float> %dtex
}
    699 
    700 ; CHECK-LABEL: {{^}}test_nonvoid_return_unreachable:
    701 ; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
    702 ; CHECK: s_wqm_b64 exec, exec
    703 ;
    704 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
    705 ; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
entry:
  ; Same as test_nonvoid_return, but the returning block is only one successor
  ; of a branch whose other arm ends in 'unreachable' — the exact-mode
  ; restore must still dominate the ret path.
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0
  %cc = icmp sgt i32 %c, 0
  br i1 %cc, label %if, label %else

if:
  ; Side-effecting store followed by unreachable: no return on this path.
  store volatile <4 x float> %dtex, <4 x float> addrspace(1)* undef
  unreachable

else:
  ; Only returning path — exec must be exact here.
  ret <4 x float> %dtex
}
    721 
    722 ; Test awareness that s_wqm_b64 clobbers SCC.
    723 ;
    724 ; CHECK-LABEL: {{^}}test_scc:
    725 ; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
    726 ; CHECK: s_wqm_b64 exec, exec
    727 ; CHECK: s_cmp_
    728 ; CHECK-NEXT: s_cbranch_scc
    729 ; CHECK: ; %if
    730 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
    731 ; CHECK: image_sample
    732 ; CHECK: ; %else
    733 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
    734 ; CHECK: image_sample
    735 ; CHECK: ; %end
define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
main_body:
  ; The compare result (SCC) feeds the branch; s_wqm_b64 clobbers SCC, so the
  ; pass must order the WQM switch so the s_cmp/s_cbranch pair stays intact
  ; (pinned by the CHECK-NEXT above).
  %cc = icmp sgt i32 %sel, 0
  br i1 %cc, label %if, label %else

if:
  ; 1D sample on the taken path; needs WQM disabled to exact before it per
  ; the CHECKs (sample result feeds an exact buffer store later).
  %r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0
  br label %end

else:
  ; 2D sample on the other path, distinct intrinsic so the two arms are
  ; distinguishable in the generated code.
  %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0
  br label %end

end:
  ; Merge and perform an exact-mode store indexed per lane.
  %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
  call void @llvm.amdgcn.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
  ret <4 x float> %r
}
    754 
    755 ; Check a case of a block being entirely WQM except for a bit of WWM.
    756 ; There was a bug where it forgot to enter and leave WWM.
    757 ;
    758 ;CHECK-LABEL: {{^}}test_wwm_within_wqm:
    759 ;CHECK: %IF
    760 ;CHECK: s_or_saveexec_b64 {{.*}}, -1
    761 ;CHECK: ds_swizzle
    762 ;
define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  ; Dependent samples force the whole function into WQM.
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ENDIF

IF:
  ; This block is entirely WQM except the set.inactive/ds_swizzle/wwm
  ; sequence, which needs WWM (all lanes enabled). The regression being
  ; guarded: the pass once failed to emit the enter/leave-WWM exec save
  ; (s_or_saveexec_b64 ..., -1 in the CHECKs) for such a block.
  %dataf = extractelement <4 x float> %dtex, i32 0
  %data1 = fptosi float %dataf to i32
  %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
  %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
  %data4 = call i32 @llvm.amdgcn.wwm.i32(i32 %data3)
  %data4f = sitofp i32 %data4 to float
  br label %ENDIF

ENDIF:
  %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
  ret float %r
}
    785 
; Intrinsic declarations used by the tests above. Attribute groups:
; #1 = nounwind (side-effecting), #2 = nounwind readonly,
; #3 = nounwind readnone, #4 = nounwind readnone convergent.
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #2
declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2
declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
; NOTE(review): llvm.AMDGPU.kill is the legacy kill intrinsic — presumably
; referenced by a test earlier in the file (not visible in this chunk).
declare void @llvm.AMDGPU.kill(float) #1
declare float @llvm.amdgcn.wqm.f32(float) #3
declare i32 @llvm.amdgcn.wqm.i32(i32) #3
declare float @llvm.amdgcn.wwm.f32(float) #3
declare i32 @llvm.amdgcn.wwm.i32(i32) #3
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
    807 
; Attribute groups referenced throughout the file. #0 is presumably defined
; in an earlier chunk of this file (calls above reference it) — not visible here.
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }
attributes #4 = { nounwind readnone convergent }
; #5 forces WQM for pixel-shader outputs; #6 sets the initial PS input address.
attributes #5 = { "amdgpu-ps-wqm-outputs" }
attributes #6 = { nounwind "InitialPSInputAddr"="2" }
    814